# IPL CHAT BOX

### PROBLEM STATEMENT : 

##### To develop a Q&A chat bot which responds to user's queries based on NLP statistics.

#### .



#### .

In [1]:
import numpy as np
import pandas as pd

# --- NLTK PACKAGE ---
import nltk

# Tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, PunktSentenceTokenizer, RegexpTokenizer

# Stemming and Lemmatizing
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Stopwords
from nltk.corpus import stopwords, state_union

# Lesk Module
from nltk.wsd import lesk

# Tagger
from nltk.tag import UnigramTagger, BigramTagger, BrillTagger

In [12]:
matches = pd.read_csv('matches.csv')
deliveries = pd.read_csv('deliveries.csv')

In [13]:
matches.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2
0,1,2008,Bangalore,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Kolkata Knight Riders,140,0,BB McCullum,M Chinnaswamy Stadium,Asad Rauf,RE Koertzen
1,2,2008,Chandigarh,2008-04-19,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings,bat,normal,0,Chennai Super Kings,33,0,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",MR Benson,SL Shastri
2,3,2008,Delhi,2008-04-19,Rajasthan Royals,Delhi Daredevils,Rajasthan Royals,bat,normal,0,Delhi Daredevils,0,9,MF Maharoof,Feroz Shah Kotla,Aleem Dar,GA Pratapkumar
3,4,2008,Mumbai,2008-04-20,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,normal,0,Royal Challengers Bangalore,0,5,MV Boucher,Wankhede Stadium,SJ Davis,DJ Harper
4,5,2008,Kolkata,2008-04-20,Deccan Chargers,Kolkata Knight Riders,Deccan Chargers,bat,normal,0,Kolkata Knight Riders,0,5,DJ Hussey,Eden Gardens,BF Bowden,K Hariharan


In [14]:
deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,1,SC Ganguly,BB McCullum,P Kumar,0,...,0,1,0,0,0,1,1,,,
1,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,2,BB McCullum,SC Ganguly,P Kumar,0,...,0,0,0,0,0,0,0,,,
2,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,3,BB McCullum,SC Ganguly,P Kumar,0,...,0,0,0,0,0,1,1,,,
3,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,4,BB McCullum,SC Ganguly,P Kumar,0,...,0,0,0,0,0,0,0,,,
4,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,5,BB McCullum,SC Ganguly,P Kumar,0,...,0,0,0,0,0,0,0,,,


# FUNCTIONS

#### BATSMAN STATS

#### Runs scored by Batsman B1 in Match X

In [15]:
def runs_batsman_match(batsman_name, match_id):
    x = deliveries.groupby(['match_id', 'batsman'])['batsman_runs'].sum()
    runs = x[match_id][batsman_name]
    return runs  

In [16]:
runs_batsman_match('SC Ganguly', 1)

10

#### Total Runs scored in Match X by team Y

In [17]:
def total_runs_team_match(team, match_id):
    x = deliveries.groupby(['match_id','batting_team'])['total_runs'].sum()
    total_runs_match = x[match_id][team]
    return total_runs_match  

In [18]:
total_runs_team_match('Royal Challengers Bangalore', 1)

82

#### Max scorer in match X

In [19]:
def max_score_batsman_match(match_id):
    x = deliveries.groupby(['match_id', 'batsman'])['batsman_runs'].sum()
    name = x[match_id].idxmax()
    runs = x[match_id].max()
    return [name, runs]  

In [20]:
max_score_batsman_match(4)

['RV Uthappa', 48]

#### Max scorer in match X in Team Y

In [21]:
def max_score_batsman_match_inTeam(match_id, team):
    x = deliveries.groupby(['match_id','batting_team', 'batsman'])['batsman_runs'].sum()
    name = x[match_id][team].idxmax()
    runs = x[match_id][team].max()
    return [name, runs]  

In [22]:
max_score_batsman_match_inTeam(1, 'Royal Challengers Bangalore')

['P Kumar', 18]

#### Lowest Scorer in Match X

In [23]:
def min_score_batsman_match(match_id):
    x = deliveries.groupby(['match_id', 'batsman'])['batsman_runs'].sum()
    name = x[match_id].idxmin()
    runs = x[match_id].min()
    return [name, runs]  


In [24]:
min_score_batsman_match(1)

['B Akhil', 0]

#### Min scorer in match X in Team Y

In [25]:
def min_score_batsman_match_inTeam(match_id, team):
    x = deliveries.groupby(['match_id','batting_team', 'batsman'])['batsman_runs'].sum()
    name = x[match_id][team].idxmin()
    runs = x[match_id][team].min()
    return [name, runs]  

In [26]:
min_score_batsman_match_inTeam(1, 'Kolkata Knight Riders')

['Mohammad Hafeez', 5]

#### Number of Balls played by Batsman B1 in match Y

In [27]:
def balls_faced_batsman_match(batsman, match_id):
    x = deliveries[(deliveries['batsman'] == batsman) & (deliveries['match_id'] == match_id) & (deliveries['wide_runs'] == 0)]
    return x.shape[0]

In [28]:
balls_faced_batsman_match('SC Ganguly', 1)

12

#### Strike Rate of Batsman B1 in Match X

In [29]:
def strikeRate_batsman_match(batsman, match_id):
    runs = runs_batsman_match(batsman, match_id)
    balls = balls_faced_batsman_match(batsman, match_id)
    
    strike_rate = runs/balls * 100
    return strike_rate

In [30]:
strikeRate_batsman_match('BB McCullum', 1)

216.43835616438358

#### Total Number of Dot Balls played by Batsman B1 in Match X

In [31]:
def dot_balls_batsman_match(batsman, match_id):
    x = deliveries[(deliveries['batsman'] == batsman) & (deliveries['match_id'] == match_id) & (deliveries['total_runs'] == 0)]
    dot_balls = x.shape[0]
    return dot_balls

In [32]:
dot_balls_batsman_match('SC Ganguly', 1)

6

#### Total Number of 4's played by Batsman B1 in Match X

In [33]:
def b_4_batsman_match(batsman, match_id):
    x = deliveries[(deliveries['batsman'] == batsman) & (deliveries['match_id'] == match_id) & (deliveries['total_runs'] == 4)]
    b_4 = x.shape[0]
    return b_4

In [34]:
b_4_batsman_match('SC Ganguly', 1)

2

#### Total Number of 6's played by Batsman B1 in Match X

In [35]:
def b_6_batsman_match(batsman, match_id):
    x = deliveries[(deliveries['batsman'] == batsman) & (deliveries['match_id'] == match_id) & (deliveries['total_runs'] == 6)]
    b_6 = x.shape[0]
    return b_6

In [36]:
b_6_batsman_match('SC Ganguly', 1)

0

#### i^th Highest Scorer

In [37]:
def highest_scorer(i):
    #player_runs = deliveries.groupby('batsman')['batsman_runs'].sum().sort_values(ascending =False)
    player_name = deliveries.groupby('batsman')['batsman_runs'].sum().sort_values(ascending =False).iloc[i:i+1]
    return  player_name

In [38]:
highest_scorer(2)

batsman
ST Jayasuriya    514
Name: batsman_runs, dtype: int64

#### Total Number of fours in Match X by team Y

In [39]:
def team_fours(match_id, batting_team):
    team_fours = deliveries[deliveries.batsman_runs == 4]
    team_fours_count = team_fours.groupby(['match_id','batting_team']).count()['inning']
    return team_fours_count[match_id][batting_team]

In [40]:
team_fours(1,'Kolkata Knight Riders')

15

#### Total Number of sixes in Match X by team Y

In [41]:
def team_sixes(match_id, batting_team):
    team_sixes = deliveries[deliveries.batsman_runs == 6]
    team_sixes_count = team_sixes.groupby(['match_id','batting_team']).count()['inning']
    return team_sixes_count[match_id][batting_team]

In [42]:
team_sixes(1,'Kolkata Knight Riders')

14

## RUNS - ALL MATCH STATS

#### Total Runs scored in Entire IPL by Batsman B1

In [43]:
def total_runs_batsman_IPL(batsman):
    x = deliveries.groupby(['batsman'])['batsman_runs'].sum()
    runs_batsman = x[batsman]
    return runs_batsman

In [44]:
total_runs_batsman_IPL('BB McCullum')

188

#### Total Runs scored in Entire IPL season 01 by team X

In [45]:
def total_runs_team_IPL(team):
    x = deliveries.groupby(['batting_team'])['total_runs'].sum()
    total_runs_IPL = x[team]
    return total_runs_IPL

In [46]:
total_runs_team_IPL('Chennai Super Kings')

2520

#### Orange CAP

In [47]:
def orange_cap():
    x = deliveries.groupby(['batsman'])['batsman_runs'].sum()
    max_scorer = [x.idxmax(), x.max()]
    return max_scorer

In [48]:
orange_cap_name = orange_cap()[0]
orange_cap_total_runs = orange_cap()[1]

print(orange_cap_name, "-->", orange_cap_total_runs)

SE Marsh --> 616


#### Highest Runs scored in an innings by a Batsman

In [49]:
def highest_runs_batsman_innings():
    x = deliveries.groupby(['match_id', 'batsman'])['batsman_runs'].sum().sort_values(ascending=False)
    batsman_and_match = [ x.idxmax(), x.max()]
    
    return batsman_and_match

In [50]:
highest_runs_batsman_innings_NAME = highest_runs_batsman_innings()[0][1]
highest_runs_batsman_innings_MATCH = highest_runs_batsman_innings()[0][0]
highest_runs_batsman_innings_RUNS = highest_runs_batsman_innings()[1]

print(highest_runs_batsman_innings_NAME, " IN MATCH = ", highest_runs_batsman_innings_MATCH, " --->", highest_runs_batsman_innings_RUNS)

BB McCullum  IN MATCH =  1  ---> 158


#### Highest Strike Rate of Batsman Overall

In [51]:
def total_runs_scored_IPL():
    runs_count = deliveries.groupby('batsman')['batsman_runs'].sum()
    return runs_count

def total_ball_faced_IPL():
    balls = deliveries[(deliveries.wide_runs == 0)].groupby('batsman')['inning']
    balls_count = balls.count()
    return balls_count

def total_strike_rate_IPL(i):
    strike_rate = (total_runs_scored_IPL()/total_ball_faced_IPL())*100
    return strike_rate.sort_values(ascending = False).iloc[i:i+1]

In [52]:
total_strike_rate_IPL(0)

batsman
Umar Gul    205.263158
dtype: float64

#### Most ball played by a player in IPL

In [53]:
total_ball_faced_IPL().sort_values(ascending = False).iloc[0:1]

batsman
SE Marsh    441
Name: inning, dtype: int64

#### Total 4's by player X in IPL

In [54]:
def overall_fours_count(batsman):
    fours = deliveries[deliveries.batsman_runs == 4]
    fours_count = fours.groupby('batsman').count()['inning']
    return fours_count[batsman]

In [55]:
overall_fours_count('BB McCullum')

13

#### Total 6's by player X in IPL

In [56]:
def overall_sixes_count(batsman):
    sixes = deliveries[deliveries.batsman_runs == 6]
    sixes_count = sixes.groupby('batsman').count()['inning']
    return sixes_count[batsman]

In [57]:
overall_sixes_count('BB McCullum')

15

#### Maximum 4's by a batsman in IPL

In [58]:
def most_fours_count(i):
    fours = deliveries[deliveries.batsman_runs == 4]
    fours_count = fours.groupby('batsman').count()['inning']
    return fours_count.sort_values(ascending = False).iloc[i:i+1]

In [59]:
for index_val, series_val in most_fours_count(0).iteritems():
        print(index_val, series_val)

G Gambhir 68


#### Maximum 6's by a batsman in IPL

In [60]:
def most_sixes_count():
    sixes = deliveries[deliveries.batsman_runs == 6]
    sixes_count = sixes.groupby('batsman').count()['inning'].sort_values(ascending = False).iloc[0:1]
    return sixes_count

In [61]:
for index_val, series_val in most_sixes_count().iteritems():
        print(index_val, series_val)

ST Jayasuriya 31


### Bowler Stats

#### Wickets taken by a bowler X in match Y

In [62]:
def wickets_by_bowler(bowler, match_id):
    total_wickets = deliveries[(deliveries['match_id'] == match_id) & (deliveries['bowler'] == bowler)].dismissal_kind.count()
    run_outs = deliveries[(deliveries['match_id'] == match_id) & (deliveries['bowler'] == bowler) & (deliveries['dismissal_kind']== 'run out')].dismissal_kind.count()
    return total_wickets-run_outs

In [63]:
wickets_by_bowler('SC Ganguly', 1)

2

#### Runs conceded by a bowler X in match Y

In [64]:
def runs_conceded_by_bowler(bowler, match_id):
    return (deliveries.groupby(['match_id','bowler']).total_runs.sum()-deliveries.groupby(['match_id','bowler']).bye_runs.sum()-deliveries.groupby(['match_id','bowler']).legbye_runs.sum())[match_id][bowler]

In [65]:
runs_conceded_by_bowler('SC Ganguly', 1)

21

#### Number of balls bowled by a bowler X including extras in Match Y

In [66]:
def balls_by_bowler(bowler, match_id):
    return (deliveries.groupby(['match_id','bowler']).ball.agg('count'))[match_id][bowler]

In [67]:
balls_by_bowler('SC Ganguly',1)

25

#### Number of overs bowled by bowler X in match Y

In [68]:
def overs_by_bowler(bowler, match_id):
    balls = (deliveries[(deliveries['match_id']==match_id)& (deliveries['bowler']== bowler) & (deliveries['wide_runs'] == 0) & (deliveries['is_super_over'] == 0) & (deliveries['noball_runs']==0)]).ball.count()
    overs = float(int(balls/6) + float(balls%6)/10)
    return (overs)

In [69]:
overs_by_bowler('LR Shukla', 1)

1.1

#### Number of extra runs conceded by bowler X in match Y (wides, noballs)

In [70]:
def extras_by_bowler(bowler, match_id):
        wide = (deliveries[(deliveries['match_id']==match_id)& (deliveries['bowler']== bowler) & (deliveries['is_super_over'] == 0)]).wide_runs.sum()
        noball = (deliveries[(deliveries['match_id']==match_id)& (deliveries['bowler']== bowler) & (deliveries['is_super_over'] == 0)]).noball_runs.sum()
        byes = (deliveries[(deliveries['match_id']==match_id)& (deliveries['bowler']== bowler) & (deliveries['is_super_over'] == 0)]).bye_runs.sum()
        leg_byes = (deliveries[(deliveries['match_id']==match_id)& (deliveries['bowler']== bowler) & (deliveries['is_super_over'] == 0)]).legbye_runs.sum()
    
        return wide, noball

In [71]:
wide, noball = extras_by_bowler('SC Ganguly', 1)
print("Wides = ",wide," ","Noballs = ", noball)

Wides =  1   Noballs =  0


#### Economy rate of bowler X in match Y

In [72]:
def economy_rate(bowler, match_id):
    import math
    runs = runs_conceded_by_bowler(bowler, match_id)
    overs = overs_by_bowler(bowler, match_id)
    frac, whole = math.modf(overs)
    total = whole + frac*10/6
    return runs/total

In [73]:
economy_rate('SC Ganguly', 1)

5.25

#### Bowler taking highest number of wickets in a match X according to rank Y

In [74]:
def highest_wickets_by_bowler(match_id, rank = 1):
    bowlers = deliveries[deliveries['match_id'] == match_id].bowler.unique()
    wick_by_bowler = {}
    for bowler in bowlers:
        wick_by_bowler[bowler] = (wickets_by_bowler(bowler, match_id))
    data = sorted(wick_by_bowler.items(), key=lambda x:x[1], reverse=True)
    return data[rank-1]
    

In [75]:
highest_wickets_by_bowler(58,1)

('YK Pathan', 3)

#### Bowler having highest economy rate in a match X according to rank Y

In [76]:
def highest_economy_rate(match_id, rank = 1):
    bowlers = deliveries[deliveries['match_id'] == match_id].bowler.unique()
    eco_by_bowler = {}
    for bowler in bowlers:
        eco_by_bowler[bowler] = round((economy_rate(bowler, match_id)),2)
    data = sorted(eco_by_bowler.items(), key=lambda x:x[1])
    return data[rank-1]
    

In [77]:
highest_economy_rate(22)

('IK Pathan', 4.5)

#### Highest Number of wickets by bowler

In [78]:
def overall_wickets_by_bowler(bowler=None,rank =1):
    bowler_wicket =[]
    if (bowler == None): 
        bowlers = deliveries.bowler.unique()
        for bowler in bowlers:
            total_wickets = deliveries[deliveries['bowler'] == bowler].dismissal_kind.count()
            run_outs = deliveries[(deliveries['bowler'] == bowler) & (deliveries['dismissal_kind']== 'run out')].dismissal_kind.count()
            bowler_wicket.append((bowler, (total_wickets - run_outs)))
    else:
        total_wickets = deliveries[deliveries['bowler'] == bowler].dismissal_kind.count()
        run_outs = deliveries[(deliveries['bowler'] == bowler) & (deliveries['dismissal_kind']== 'run out')].dismissal_kind.count()
        bowler_wicket.append((bowler, (total_wickets - run_outs)))
    bowler_wicket = sorted(bowler_wicket, key=lambda x: x[1], reverse=True)
    return  bowler_wicket[rank-1]

In [79]:
overall_wickets_by_bowler()

('Sohail Tanvir', 22)

#### Purple Cap

In [80]:
def purple_cap():
    print("Purple Cap: ", overall_wickets_by_bowler(rank = 1))

In [81]:
purple_cap()

Purple Cap:  ('Sohail Tanvir', 22)


In [82]:
import pandas as pd
import nltk

In [202]:
def feature_extractor(sentence):
    features = {'highest': 0
                ,'scored': 0
                ,'runs': 0
                ,'scorer': 0
                ,'score':0
                ,'match':0
                ,'wickets': 0
                ,'boundary': 0
                ,'boundaries':0
                ,'fours':0
                ,'4s':0
                ,'six':0
                ,'sixes':0
                ,'6s':0
                ,'hit':0
                ,'Fours':0
                ,'aggregate':0
                ,'total':0
                ,'team':0
                ,'lead':0
                ,'leading':0
                ,'maximum':0
                ,'max':0
                ,'minimum':0
                ,'min':0
                ,'least':0
                ,'less':0
                ,'1st':0,'2nd':0,'3rd':0,'4th':0,'5th':0,'6th':0,'7th':0,'8th':0,'9th':0,'10th':0    
                ,'dot':0
                ,'dots':0
                ,'faced':0 
                ,'entire':0
                ,'whole':0
                ,'season':0
                ,'balls':0
                ,'deliveries':0
                ,'faced':0
                
               }
    
    tokenized_sentence = nltk.word_tokenize(sentence)
    word_counts = nltk.Counter(tokenized_sentence)
    for word in word_counts:
        if word in features:
            features[word] = word_counts[word]
    return features

In [203]:
train_sentences = [('Total runs scored by SC Ganguly in match 5?' ,'runs'),
                    ('SC Ganguly score in match 1?','runs'),
                    ('how many runs did Ganguly score in match 2?','runs'),
                    ("Sachin's score in 4th match?", 'runs'),
                    ('McCullum scored how much in match 3?','runs'),
                    ("Dravid's runs in match 2?",'runs'),
                    ("Dravid's score in match 5?",'runs'),
                    ('How much did RCB Score in Match 1?','runs'),
                    ('RCB Score in match 1?','runs'),
                    ('RCB runs in match 1','runs'),
                    ('Team score of rcb in match 4','runs'),
                    ('total score of DDin 3rd match','runs'),
                    ('totals runs of Deccan in 8th match','runs'),
                    ('final score of DD in 3rd match','runs'),
                    ('Rahul maximum score in match 3','max_runs'),
                    ("who was the leading run scorer in match 9", 'max_runs'),
                    ('highest score in match 3','max_runs'),
                    ('maximum scorer of match 2','max_runs'),
                    ("who scored maximum runs in match 6", 'max_runs'),
                    ("who was the top scorer in match 6", 'max_runs'), 
                    ('minimum score in match 3', 'min_runs'),
                    ('lowest score in match 3','min_runs'),
                    ('minimumscorer of match 2','min_runs'),
                    ('who scored less runs in match 3?','min_runs'),
                    ('who was the least run scorer in match 3?','min_runs'),
                    ("who scored the maximum run match 3 by DD",'max_runs'),
                    ("top scorer for DD in match 3",'max_runs'),
                    ("top score in DD match 4?",'max_runs'),
                    ("highest scorer for DD in match 4",'max_runs'),
                    ("who scored the minimum run match 3 by DD",'min_runs'),
                    ("least scorer for DD in match 3",'min_runs'),
                    ("lowest score in DD match 4?",'min_runs'),
                    ("low scorer for DD in match 4",'min_runs'),
                    ("total runs scored by sachin?",'total_runs'),
                    ("aggregate runs by Abhay?",'total_runs'),
                    ("how many runs has sachin made in ipl1?",'total_runs'),
                    ("Total runs made by sachin?",'total_runs'),
                    ("What is the total runs made by RCB?",'total_runs'),
                    ("what is the runs made by RCB in ipl 9?",'total_runs'),
                    ("What is the sum total runs made by RCB",'total_runs'),
                    ("how many fours did Kohli score in match 2?",'fours'),
                    ("4s hit by Sachin in match 1?",'fours'),
                    ("Fours hit by Abhay in match 5?",'fours'),
                    ("how many sixes did Kohli score in match 2?",'sixes'),
                    ("6s hit by Sachin in match 1?",'sixes'),
                    ("Sixes hit by Abhay in match 5?",'sixes'),
                    ("how many sixes did Kohli hit in match 2?",'sixes'),
                    ("who was 3rd highest scorer in match 3?",'ith_highest_scorer'),
                    ("who was 3rd top scorer in match 4?",'ith_highest_scorer'),
                    ("who was 2nd maximum run scorer in match 6?",'ith_highest_scorer'), 
                    ("rahul faced how many dot balls in match 1",'dot_balls'),
                    ("rahul faced how many dots in match 1",'dot_balls'),
                    ("how many boundaries are scored by rcb in match 6",'fours'),
                    ("how many 4s are scored by rcb in match 3",'fours'),
                    ("how many fours are scored by rcb in match 6",'fours'),
                    ("how many boundaries are hit by rcb in match 6",'fours'),
                    ("how many 4s are hit by rcb in match 6",'fours'),
                    ("how many fours are hit by rcb in match 6",'fours'),
                    ("what are total boundaries of rcb in match 6",'fours'),
                    ("what are total fours of rcb in match 6",'fours'),
                    ("what are total 4s of rcb in match 5",'fours'),   
                    ("how many 6s are scored by rcb in match 6",'sixes'),
                    ("how many sixes are scored by rcb in match 6",'sixes'),
                    ("how many 6s are hit by rcb in match 6",'sixes'),
                    ("how many sixes are hit by rcb in match 6",'sixes'),
                    ("what are total sixes of rcb in match 6",'sixes'),
                    ("what are total 6s of rcb in match 5",'sixes'),
                    ("how many fours are hit by kohli in this season?",'fours'),
                    ("how many 4s are hit by kohli in this season?",'fours'),
                    ("how many boundaries are hit by kohli in this season?",'fours'),
                    ("how many fours are scored by kohli in this season?",'fours'),
                    ("how many 4s are scored by kohli in this season?",'fours'),
                    ("how many boundaries are hit by kohli in this season?",'fours'),
                    ("how many boundaries are hit by kohli in this entire ipl?",'fours'),
                    ("how many 4s are hit by kohli in this entire ipl?",'fours'),
                    ("how many fours are hit by kohli in this entire ipl?",'fours'),
                    ("how many boundaries are hit by kohli in this whole ipl?",'fours'),
                    ("how many 4s are hit by kohli in this whole ipl?",'fours'),
                    ("how many fours are hit by kohli in this whole ipl?",'fours'),
                    ("how many sixes are hit by kohli in this season?",'sixes'),
                    ("how many 6s are hit by kohli in this season?",'sixes'),
                    ("how many sixes are scored by kohli in this season?",'sixes'),
                    ("how many 6s are scored by kohli in this season?",'sixes'),
                    ("how many 6s are hit by kohli in this entire ipl?",'sixes'),
                    ("how many sixes are hit by kohli in this entire ipl?",'sixes'),
                    ("how many 6s are hit by kohli in this whole ipl?",'sixes'),
                    ("how many sixes are hit by kohli in this whole ipl?",'sixes'),
                    ("who scored most fours in entire ipl?",'fours'),
                    ("who hit most fours in entire ipl?",'fours'),
                    ("who scored most 4s in entire ipl?",'fours'),
                    ("who hit most 4s in entire ipl?",'fours'), 
                    ("who scored most boundaries in entire ipl?",'fours'),
                    ("who hit most boundaries in entire ipl?",'fours'), 
                    ("who scored most fours in whole ipl?",'fours'),
                    ("who hit most fours in whole ipl?",'fours'),
                    ("who scored most 4s in whole ipl?",'fours'),
                    ("who hit most 4s in whole ipl?",'fours'), 
                    ("who scored most boundaries in whole ipl?",'fours'),
                    ("who hit most boundaries in whole ipl?",'fours'), 
                    ("who scored most fours in this season?",'fours'),
                    ("who hit most fours in this season?",'fours'),
                    ("who scored most 4s in this season?",'fours'),
                    ("who hit most 4s in this season?",'fours'), 
                    ("who scored most boundaries in this season?",'fours'),
                    ("who hit most boundaries in this season?",'fours'), 
                    ("who scored most sixes in entire ipl?",'sixes'),
                    ("who hit most sixes in entire ipl?",'sixes'),
                    ("who scored most 6s in entire ipl?",'sixes'),
                    ("who hit most 6s in entire ipl?",'sixes'), 
                    ("who scored most sixes in whole ipl?",'sixes'),
                    ("who hit most sixes in whole ipl?",'sixes'),
                    ("who scored most 6s in whole ipl?",'sixes'),
                    ("who hit most 6s in whole ipl?",'sixes'), 
                    ("who scored most sixes in this season?",'sixes'),
                    ("who hit most sixes in this season?",'sixes'),
                    ("who scored most 6s in this season?",'sixes'),
                    ("who hit most 6s in this season?",'sixes'), 
                   
                    ("how many balls were faced by BB McCullum in match 3?",'strike_rate'),
                    ("how many deliveries were faced by BB Mccullum in match 3?",'strike_rate'), 
                   
                    ("what was the strike rate of BB McCullum in match 1?",'strike rate'),
                    ("BB McCullum's strike rate in 1st match?",'strike rate'),
 
                    ("Who are leading run scorers in this season?",'runs'),
                    ("who are leading run scorers in this ipl?",'runs'),
                    ("who are leading run scorers in this total ipl?",'runs'), 
                    ("who are leading run scorers in this entire ipl?",'runs'),
                    ("Who are top run scorers in this season?",'runs'),
                    ("who are top run scorers in this ipl?",'runs')
                    ("who are top run scorers in this total ipl?",'runs'), 
                    ("who are top run scorers in this entire ipl?",'runs')                    
                    
                  ]

In [204]:
naive_bayes_classifier = nltk.NaiveBayesClassifier.train([(feature_extractor(sentence), label) for sentence, label in train_sentences])

In [227]:
user_query = "how many runs did Delhi Daredevils made in 4th match?"
feature_set_another_sentence = feature_extractor(user_query)

In [228]:
classifier = naive_bayes_classifier.classify(feature_set_another_sentence)
classifier

'runs'

In [194]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [195]:
stop_words = set(stopwords.words("english"))
#print(stop_words)
words = word_tokenize(user_query)
words_filter = []
for w in words:
    if w not in stop_words:
        #w = w.lower()
        words_filter.append(w)
words_filter      

['many', '4s', 'scored', 'Kolkata', 'Knight', 'Riders', '4th', 'match', '?']

In [196]:
boundary_list = ['4s','6s','sixes','fours']
words_lemmatize = []
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
for w in words_filter:
    if w in boundary_list:
        words_lemmatize.append(w)
    else:
        s = lemmatizer.lemmatize(w)
        words_lemmatize.append(s)
        
    
words_lemmatize    

['many', '4s', 'scored', 'Kolkata', 'Knight', 'Riders', '4th', 'match', '?']

In [197]:
words_tagged = nltk.pos_tag(words_lemmatize)
words_tagged

[('many', 'JJ'),
 ('4s', 'CD'),
 ('scored', 'VBD'),
 ('Kolkata', 'NNP'),
 ('Knight', 'NNP'),
 ('Riders', 'NNP'),
 ('4th', 'CD'),
 ('match', 'NN'),
 ('?', '.')]

In [198]:
list_of_teams = matches.team1.unique()
list_of_teams

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Mumbai Indians', 'Deccan Chargers', 'Kings XI Punjab',
       'Royal Challengers Bangalore', 'Delhi Daredevils'], dtype=object)

In [199]:
chunkGram = r"""Chunk:{<NNP?>*<NNP>}"""
chunkParser = nltk.RegexpParser(chunkGram)
chunked = chunkParser.parse(words_tagged)
chunked.draw()

In [200]:
for node in chunked:
    if hasattr(node, "label"):
        words = [word for word, tag in node.leaves()]
        chunked_words = ' '.join(words)
chunked_words


'Kolkata Knight Riders'

In [201]:
if(classifier == 'fours'):
    s = 0 
    for team_name in list_of_teams:
        if(chunked_words == team_name):
            s = 1
            team = team_name
    if(s == 1):
        for j in range(len(chunked)):
            if(chunked[j][1] == 'CD'):
                flag = 0
                for p in boundary_list:
                    if(chunked[j][0] == p):
                        flag = 1
                if(flag == 0):
                    match_id = chunked[j][0]  
                    print(match_id)
        #print(total_runs_team_match(team, int(match_id)))  
    else:
        person_name = chunked_words
        for j in range(len(chunked)):
            if(chunked[j][1] == 'CD'):
                match_id = chunked[j][0]
        print(runs_batsman_match(person_name, int(match_id)))        

4th
