In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error

In [2]:
import pickle

In [3]:
final_df = pd.read_csv('final.csv',index_col=[0])

In [4]:
final_df = final_df.sample(final_df.shape[0])

In [5]:
eligible_cities = final_df['city'].value_counts()[final_df['city'].value_counts() > 600].index.tolist()

final_df = final_df[final_df['city'].isin(eligible_cities)]

In [29]:
eligible_cities

['Colombo',
 'London',
 'Mirpur',
 'Sydney',
 'Centurion',
 'Melbourne',
 'Abu Dhabi',
 'Rangiri',
 'Johannesburg',
 'Adelaide',
 'Birmingham',
 'Perth',
 'Auckland',
 'Brisbane',
 'Dubai',
 'Karachi',
 'Wellington',
 'Cape Town',
 'Southampton',
 'Lahore',
 'Manchester',
 'Hamilton',
 'Cardiff',
 'Durban',
 'Nottingham',
 'Pallekele',
 'Sharjah',
 'Christchurch',
 'Mumbai',
 'Port Elizabeth',
 'Chandigarh',
 'Hambantota',
 'Dhaka',
 'Leeds',
 'Delhi',
 'Antigua',
 'Chennai',
 'Ahmedabad',
 'Guyana',
 'Napier',
 'Chester-le-Street',
 'Trinidad',
 'St Kitts',
 'St Lucia',
 'Jamaica',
 'Pune',
 'Hobart',
 'Barbados',
 'Chattogram',
 'Hyderabad',
 'Harare',
 'Bloemfontein',
 'Kolkata',
 'Bridgetown',
 'Mount Maunganui',
 'Nagpur',
 'Visakhapatnam',
 'Grenada',
 'Chittagong',
 'Rawalpindi',
 'Dunedin',
 'Jaipur',
 'Lucknow',
 'Bristol',
 'Paarl',
 'Rajkot',
 'Multan',
 'Fatullah',
 'Dambulla',
 'Kandy',
 'Nelson',
 'Amstelveen',
 'Indore',
 'Cuttack',
 'Dharamsala',
 'Canberra',
 'Kanpur',

In [7]:
X = final_df.drop(columns=['match_id','runs_x','runs_y'])
y = final_df['runs_y']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [8]:
trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse=False,drop='first'),['batting_team','bowling_team','city'])
]
,remainder='passthrough')

In [25]:
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',StandardScaler()),
    ('step3',XGBRegressor(n_estimators=1500,learning_rate=0.1,max_depth=15,random_state=42))
])

In [14]:
parameters = {
    'step3__max_depth': range (2, 10, 15),
    'step3__n_estimators': range(500, 1000, 2000),
    'step3__learning_rate': [0.2 , 0.1, 0.01, 0.05]
}

In [26]:
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))



0.9588811279508871
4.566884782805429


In [27]:
pickle.dump(pipe,open('model.pkl','wb'))

In [15]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,parameters,cv=5,scoring='accuracy')

In [16]:
grid.fit(X_train,y_train)
grid.best_score_

Traceback (most recent call last):
  File "c:\Users\dhair\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dhair\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dhair\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dhair\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_param_validation.py", line 214, in wrapper
    retu

nan

In [17]:
grid.best_params_

{'step3__learning_rate': 0.2,
 'step3__max_depth': 2,
 'step3__n_estimators': 500}