## Import Datasets and Packages

In [116]:
!pip install pandoc

Collecting pandoc
  Downloading pandoc-2.3.tar.gz (33 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting plumbum (from pandoc)
  Downloading plumbum-1.8.3-py3-none-any.whl.metadata (10 kB)
Collecting ply (from pandoc)
  Downloading ply-3.11-py2.py3-none-any.whl.metadata (844 bytes)
Downloading plumbum-1.8.3-py3-none-any.whl (127 kB)
   ---------------------------------------- 0.0/127.6 kB ? eta -:--:--
   ------------ --------------------------- 41.0/127.6 kB 2.0 MB/s eta 0:00:01
   -------------------------------------- - 122.9/127.6 kB 1.2 MB/s eta 0:00:01
   -------------------------------------- 127.6/127.6 kB 938.9 kB/s eta 0:00:00
Downloading ply-3.11-py2.py3-none-any.whl (49 kB)
   --------

DEPRECATION: nb-black 1.0.7 has a non-standard dependency specifier black>='19.3'; python_version >= "3.6". pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of nb-black or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [117]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress ALL warnings globally (blanket filter, not a specific category)
warnings.filterwarnings("ignore")
import os

In [118]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
# Read the whole CSV file into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="ML df.csv")

In [119]:
# Columns used as model inputs (requires df from the loading cell).
feature_columns = [
    'City', 'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision',
    'Team1BatAvg', 'Team2BatAvg', 'Team1BallAvg', 'Team2BallAvg',
]
# Take a copy so later changes to X are never written back into df.
X = df[feature_columns].copy()
# Target column to predict.
y = df['WinningTeam']

In [120]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Label Encoding

In [121]:
# Integer-encode the categorical feature columns, keeping one fitted
# LabelEncoder per column in `label_encoders` so each mapping can be reused.
label_encoders = {}
categorical_columns = ['City', 'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision']

for column in categorical_columns:
    le = LabelEncoder()
    # Fit on the untouched values in df rather than on X[column]: X[column] is
    # overwritten on this line, so re-running the cell would otherwise fit the
    # encoder on already-encoded integers and le.classes_ would no longer hold
    # the original category values.
    X[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [122]:
# Encode the target as integer class ids.
# Fit on df['WinningTeam'] instead of on y itself: y is replaced by the encoded
# array below, so fitting on y would make a second run of this cell encode
# integers and drop the original labels from le_y.classes_.
le_y = LabelEncoder()
y = le_y.fit_transform(df['WinningTeam'])

## Training and Testing

In [123]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

## Model Fitting

In [124]:

# Every estimator is constructed and trained in a single expression (fit()
# returns the estimator itself) and then predicts the held-out rows.

# Logistic regression with an iteration cap of 1000.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

# k-nearest neighbours using the 5 closest training rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Random forest of 100 trees, seeded for repeatable results.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Single decision tree, seeded for repeatable results.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

## Accuracy Measures

In [125]:
def evaluate_model(model, X_train, y_train, y_test, y_pred):
    """Compute the evaluation metrics for one fitted classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels, used for training accuracy.
    y_test : true labels of the held-out rows.
    y_pred : predictions already made by ``model`` for the held-out rows.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy, report)`` where ``report`` is the
        text produced by ``classification_report``.
    """
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0.00 for classes that were never predicted
    # without relying on a global warnings filter to hide the undefined-metric
    # warning.
    report = classification_report(y_test, y_pred, zero_division=0)
    return train_accuracy, test_accuracy, report


# Logistic Regression
log_reg_train_accuracy, log_reg_test_accuracy, log_reg_report = evaluate_model(
    log_reg, X_train, y_train, y_test, y_pred_log_reg
)

# K-Nearest Neighbors (KNN)
knn_train_accuracy, knn_test_accuracy, knn_report = evaluate_model(
    knn, X_train, y_train, y_test, y_pred_knn
)

# Random Forest
rf_train_accuracy, rf_test_accuracy, rf_report = evaluate_model(
    rf, X_train, y_train, y_test, y_pred_rf
)

# Decision Tree
dt_train_accuracy, dt_test_accuracy, dt_report = evaluate_model(
    dt, X_train, y_train, y_test, y_pred_dt
)

# Print overall performance metrics: one line per model, same text as before.
print("Overall Performance Metrics:")
accuracy_rows = [
    ("Logistic Regression - Training Accuracy:", log_reg_train_accuracy, log_reg_test_accuracy),
    ("K-Nearest Neighbors - Training Accuracy:", knn_train_accuracy, knn_test_accuracy),
    ("Random Forest - Training Accuracy:", rf_train_accuracy, rf_test_accuracy),
    ("Decision Tree - Training Accuracy:", dt_train_accuracy, dt_test_accuracy),
]
for row_label, row_train_accuracy, row_test_accuracy in accuracy_rows:
    print(row_label, row_train_accuracy, "Test Accuracy:", row_test_accuracy)

# Per-class precision / recall / f1 tables, in the same model order.
print("\nClassification Reports:")
report_rows = [
    ("Logistic Regression:", log_reg_report),
    ("K-Nearest Neighbors:", knn_report),
    ("Random Forest:", rf_report),
    ("Decision Tree:", dt_report),
]
for report_label, report_text in report_rows:
    print(report_label)
    print(report_text)


Overall Performance Metrics:
Logistic Regression - Training Accuracy: 0.3544973544973545 Test Accuracy: 0.2894736842105263
K-Nearest Neighbors - Training Accuracy: 0.49074074074074076 Test Accuracy: 0.2894736842105263
Random Forest - Training Accuracy: 1.0 Test Accuracy: 0.49473684210526314
Decision Tree - Training Accuracy: 1.0 Test Accuracy: 0.42105263157894735

Classification Reports:
Logistic Regression:
              precision    recall  f1-score   support

           0       0.48      0.50      0.49        30
           1       0.14      0.11      0.12        18
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.14      0.15      0.15        20
           6       0.00      0.00      0.00         1
           7       0.36      0.31      0.33        32
           8       0.29      0.24      0.26        21
           9       0.00      0.00      0.00         2
          10       0.25      0.32      0.28    