In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno
from collections import Counter

In [None]:
!pip install catboost

In [None]:
#machine learning models
from sklearn.linear_model import LinearRegression,Perceptron
from sklearn.svm import SVR,LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

#model evaluation
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

#hyperparameter tuning
from sklearn.model_selection import GridSearchCV

In [None]:
df=pd.read_csv('/content/train.csv')

In [None]:
df.head(10)

In [None]:
df.isna().sum()

In [None]:
df.info()

In [None]:
df["extras"].unique()

In [None]:
df[df["extras"] == "extras"]

In [None]:
# Remove every row whose "extras" cell holds the literal text "extras"
# (presumably a header line repeated inside the data — confirm against the CSV).
# Selecting by value instead of a hard-coded row label keeps the cell safe to
# re-run: a second execution simply finds nothing to drop instead of raising.
df.drop(index=df.index[df["extras"] == "extras"], inplace=True)

In [None]:
df["extras"].unique()

In [None]:
cols = ["result", "toss"]

# Encode the two flag columns as 0/1 integers.
# The values are normalised to upper-case text first so the encoding works
# whether pandas read them as strings ("TRUE"), as real booleans (True) or,
# on a re-run of this cell, as the already-encoded integers (1).
for col in cols:
    flag_text = df[col].astype(str).str.strip().str.upper()
    df[col] = flag_text.isin(["TRUE", "1"]).astype("int")
df[cols]

In [None]:
# Scaffold: one empty-string entry per column, displayed so the names can be
# copied into the explicit dtype mapping.
dtypes = dict.fromkeys(df.columns, "")
dtypes

In [None]:
# Target dtype for each numeric column; passed to df.astype(), so every key
# has to match a column label of df exactly.
dtypes = {
    'year': 'int',
    'toss': 'int',
    'wickets lost': 'float',
    'fours': 'float',
    'sixes': 'float',
    'extras': 'float',
    'run rate ': 'float',  # this key ends with a trailing space
    'average innings strike rate': 'float',
    'highest score': 'int',
    'wickets taken': 'float',
    'given extras': 'float',
    'result': 'int',
    'runs scored': 'int'
}

In [None]:
df.info()

In [None]:
df["highest score"].describe()

In [None]:
for col in df.columns:
  print(col)

In [None]:
df = df.astype(dtypes)

In [None]:
df.info()

In [None]:
sns.pairplot(df)

In [None]:
# Count team/result combinations, then divide each result column by its own
# total so that every column of the table sums to 1.
# NOTE(review): this normalises per result value, not per team — if a per-team
# win rate was intended, the division axis should be confirmed.
contigency_tbl = pd.crosstab(index=df["team name"], columns=df["result"])
probability_tbl = contigency_tbl.div(contigency_tbl.sum(axis=0), axis="columns")
sns.heatmap(data=probability_tbl, annot=True, center=0.5, cmap="Greys")

In [None]:
# Derived feature: "boundaries" is the row-wise sum of the sixes and fours columns.
df["boundaries"]=(df["sixes"]+df["fours"])

In [None]:
df.head()

In [None]:
df["wickets left"]=11-df["wickets lost"]

In [None]:
df.head()

In [None]:
df.plot.scatter(x='wickets lost', y='runs scored', c='wickets lost')

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
missingno.matrix(df)

In [None]:
# Columns that are one-hot encoded further down.
categorical_cols = ["team name", "opponent", "host"]
# Numeric columns, in the same order as the dtype mapping plus the two derived
# features. "wickets lost" was previously missing because "wickets taken" was
# listed twice.
numeric_cols = ["year", "toss", "wickets lost", "fours", "sixes", "extras", "run rate ",  # trailing space is part of the name
                "average innings strike rate", "highest score", "wickets taken", "given extras", "result", "boundaries", "wickets left"]

In [None]:
one_hot_encoded_data = pd.get_dummies(df, columns=categorical_cols)
print(one_hot_encoded_data)

In [None]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

In [None]:
one_hot_encoded = encoder.fit_transform(df[categorical_cols])

In [None]:
# Give the encoded frame the same index as df. df no longer has a contiguous
# index (a row was dropped earlier), so a default 0..n-1 index here would make
# the column-wise concat pair rows with the wrong indicators and add a NaN row.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

In [None]:
df_encoded = pd.concat([df, one_hot_df], axis=1)

In [None]:
# The raw categorical columns are redundant once their one-hot indicators exist.
df_encoded.drop(columns=["team name", "opponent", "host"], inplace=True)

In [None]:
df_cleaned = df_encoded.dropna(axis=1, how='all')

In [None]:
# Fill missing values with the truncated column mean. The result is assigned
# back to the column: calling fillna(inplace=True) on a selected column is
# deprecated and does not update df under pandas copy-on-write.
# NOTE: only df is changed here; df_encoded / df_cleaned were built earlier.
df['boundaries'] = df['boundaries'].fillna(int(df['boundaries'].mean()))

In [None]:
# Fill missing values with the truncated column mean, assigning the result back
# to the column (fillna(inplace=True) on a selected column is deprecated and is
# a no-op under pandas copy-on-write).
# NOTE: only df is changed here; df_encoded / df_cleaned were built earlier.
for col in ('wickets left', 'wickets taken'):
    df[col] = df[col].fillna(int(df[col].mean()))

In [None]:
# Keep only the rows that have a value for the regression target.
df_cleaned = df_cleaned.dropna(subset=['runs scored'])

In [None]:
train_dataset, test_dataset = train_test_split(df_cleaned, test_size=0.2, random_state=10)

In [None]:
def X_y_split(df: pd.DataFrame):
    """Separate a frame into model features and the regression target.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains a "runs scored" column.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``df`` without the "runs scored" column and ``y``
        is that column. ``df`` itself is left unchanged.

    Raises
    ------
    KeyError
        If ``df`` has no "runs scored" column.
    """
    target_name = "runs scored"
    y = df[target_name]
    X = df.drop(columns=[target_name])
    return X, y

In [None]:
X_train, y_train = X_y_split(train_dataset)

In [None]:
X_train.head()

In [None]:
X_train.info()

In [None]:
X_test, y_test = X_y_split(test_dataset)

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation: the statistics are learned from the training features only
# and then applied to both splits. Each result is wrapped in a fresh DataFrame,
# so the column labels and index become default integers.
imputer = SimpleImputer(strategy="mean").fit(X_train)
X_train, X_test = (pd.DataFrame(imputer.transform(split)) for split in (X_train, X_test))

In [None]:
X_train.isna().sum()

In [None]:
df_cleaned.reset_index(drop=True, inplace=True)

In [None]:
# Support-vector regressor with default hyperparameters, scored on the hold-out split.
svr = SVR()
svr.fit(X_train, y_train)
y_test_pred = svr.predict(X_test)
# RMSE = sqrt(MSE). The `squared=False` flag of mean_squared_error is deprecated
# and rejected by recent scikit-learn releases, so take the root explicitly.
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_test_pred)))

In [None]:
# Ordinary least-squares baseline, scored on the hold-out split.
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_test_pred = linreg.predict(X_test)
# RMSE = sqrt(MSE). The `squared=False` flag of mean_squared_error is deprecated
# and rejected by recent scikit-learn releases, so take the root explicitly.
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_test_pred)))

In [None]:
# NOTE(review): sklearn's Perceptron is a linear *classifier*; fitting it on
# "runs scored" treats every distinct score as its own class label. Kept for
# comparison with the other models, but a regressor is the appropriate tool.
p = Perceptron()
p.fit(X_train, y_train)
y_test_pred = p.predict(X_test)
# RMSE = sqrt(MSE). The `squared=False` flag of mean_squared_error is deprecated
# and rejected by recent scikit-learn releases, so take the root explicitly.
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_test_pred)))

In [None]:
# Linear support-vector regressor, scored on the hold-out split.
# Seeded (same value as the train/test split) so repeated runs give the same model.
linsvr = LinearSVR(random_state=10)
linsvr.fit(X_train, y_train)
# Predict once; both historical names refer to the same predictions.
Y_pred = y_test_pred = linsvr.predict(X_test)
# RMSE = sqrt(MSE). The `squared=False` flag of mean_squared_error is deprecated
# and rejected by recent scikit-learn releases, so take the root explicitly.
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_test_pred)))

In [None]:
# Random-forest regressor, scored on the hold-out split.
# Seeded (same value as the train/test split) so repeated runs give the same forest.
rfg = RandomForestRegressor(random_state=10)
rfg.fit(X_train, y_train)
# Predict once; both historical names refer to the same predictions.
Y_pred = y_test_pred = rfg.predict(X_test)
# RMSE = sqrt(MSE). The `squared=False` flag of mean_squared_error is deprecated
# and rejected by recent scikit-learn releases, so take the root explicitly.
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_test_pred)))

In [None]:
# CatBoost regressor with default hyperparameters, scored on the hold-out split.
cbr = CatBoostRegressor()
cbr.fit(X_train, y_train)
# Predict once; both historical names refer to the same predictions.
Y_pred = y_test_pred = cbr.predict(X_test)
# RMSE = sqrt(MSE). The `squared=False` flag of mean_squared_error is deprecated
# and rejected by recent scikit-learn releases, so take the root explicitly.
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_test_pred)))

In [None]:
df_cleaned

In [None]:
# Refit the SVR on every labelled row before predicting the submission set.
# Previously X and y were computed here but the model was refitted on the
# training split only, which left them unused.
X, y = X_y_split(df_cleaned)
# X still contains missing values, so it goes through the imputer that was
# fitted on the training features (SVR cannot handle NaN).
svr.fit(imputer.transform(X), y)

In [None]:
df_sub = pd.read_csv("/content/test.csv")
df_sub

In [None]:
df_sub.info()

In [None]:
cols = ["result", "toss"]

# Encode the two flag columns as 0/1 integers.
# The values are normalised to upper-case text first so the encoding works
# whether pandas read them as strings ("TRUE"), as real booleans (True) or,
# on a re-run of this cell, as the already-encoded integers (1). A plain
# `== "TRUE"` comparison turns a boolean column into all zeros.
for col in cols:
    flag_text = df_sub[col].astype(str).str.strip().str.upper()
    df_sub[col] = flag_text.isin(["TRUE", "1"]).astype("int")
df_sub[cols]

In [None]:
df_sub["boundaries"]=(df_sub["sixes"]+df_sub["fours"])

In [None]:
df_sub["wickets left"]=11-df_sub["wickets lost"]

In [None]:
df_sub.head()

In [None]:
# Encode the submission rows with the encoder that was fitted on the training
# data, so the indicator columns and their order match what the model saw.
# Refitting here would derive a different column set from the test categories.
# handle_unknown="ignore" makes categories unseen in training encode as all
# zeros instead of raising.
one_hot_encoded = encoder.set_params(handle_unknown="ignore").transform(df_sub[categorical_cols])

In [None]:
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_cols))

In [None]:
df_sub_encoded = pd.concat([df_sub, one_hot_df], axis=1)

In [None]:
df_sub_encoded.info()

In [None]:
# Compare the *encoded* submission frame with the training frame. Using the
# raw df_sub here would report every one-hot column as missing.
columns_df_sub = set(df_sub_encoded.columns)
columns_df = set(df_cleaned.columns)
# In training but absent from the submission frame.
columns_we_need = columns_df - columns_df_sub
# In the submission frame but never seen in training.
columns_sus = columns_df_sub - columns_df

In [None]:
columns_we_need

In [None]:
# Add an all-zero indicator for every one-hot column of the training frame that
# the submission frame lacks (a category that never occurs in the submission
# rows). The list is derived from the data instead of being typed by hand, and
# existing columns are never overwritten.
one_hot_prefixes = tuple(f"{col}_" for col in categorical_cols)
columns_to_zero = [
    name for name in df_cleaned.columns
    if name.startswith(one_hot_prefixes) and name not in df_sub_encoded.columns
]
if columns_to_zero:
    df_sub_encoded[columns_to_zero] = 0

In [None]:
columns_sus

In [None]:
# The raw categorical columns are redundant once their one-hot indicators exist.
df_sub_encoded.drop(columns=["team name", "opponent", "host"], inplace=True)

In [None]:
df_sub.info()

In [None]:
# Prepare the submission features exactly like the training features:
#  1. put the columns in the order the training imputer was fitted on — the
#     models consume the features by position, and indicator columns appended
#     to the submission frame otherwise sit in different positions;
#  2. impute with the statistics learned from the training data instead of
#     fitting a second imputer on the submission rows.
# Columns absent from the submission frame are created as zeros; this assumes
# they are one-hot indicators of categories that do not occur in it.
df_sub_aligned = df_sub_encoded.reindex(columns=imputer.feature_names_in_, fill_value=0)
df_sub = pd.DataFrame(imputer.transform(df_sub_aligned))

In [None]:
df_sub.head()

In [None]:
columns_we_need

In [None]:
columns_sus

In [None]:
# Predict the submission rows and store them as a one-column frame whose index,
# named "Id", is the row position.
mp = svr.predict(df_sub)
y_pred = (
    pd.DataFrame({'Runs Scored': mp})
    .assign(Id=lambda frame: frame.index)
    .set_index('Id')
)

# The index is written as the first CSV column.
y_pred.to_csv('submission.csv')
y_pred