In [13]:
# Import required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download NLTK resources (run once if not already downloaded)
nltk.download('stopwords')
nltk.download('wordnet')

# Load the enhanced dataset
df = pd.read_csv('enhanced_dataset.csv')  # Replace with your file path
print("Dataset loaded successfully!")
print(df.head())  # Check the first few rows to confirm

Dataset loaded successfully!
   Ball  Ball Number    Bowler       Batsman  Runs  \
0     1          0.1  Jamieson  Rohit Sharma   0.0   
1     2          0.2  Jamieson  Rohit Sharma   6.0   
2     3          0.3  Jamieson  Rohit Sharma   2.0   
3     4          0.4  Jamieson  Rohit Sharma   0.0   
4     5          0.5  Jamieson  Rohit Sharma   0.0   

                                         Description  match_id  is_extra  \
0  Away swing, low bounce, beats the outside edge...         1     False   
1  Pulled behind square. Old-school Rohit, standi...         1     False   
2  Touch too straight, clipped off the pads. Thro...         1     False   
3         Length on fourth, extra bounce. Good leave         1     False   
4  A bit of away movement, that was late swing. L...         1     False   

   runs_for_total  total_runs  ...  overs_bowled  current_run_rate  \
0             0.0         0.0  ...      0.166667               0.0   
1             6.0         6.0  ...      0.333333 

[nltk_data] Downloading package stopwords to /home/nikhil-
[nltk_data]     saxena/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nikhil-
[nltk_data]     saxena/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# Set up stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to the Description column
df['cleaned_description'] = df['Description'].apply(preprocess_text)
print("Text preprocessing completed!")
print(df[['Description', 'cleaned_description']].head())

Text preprocessing completed!
                                         Description  \
0  Away swing, low bounce, beats the outside edge...   
1  Pulled behind square. Old-school Rohit, standi...   
2  Touch too straight, clipped off the pads. Thro...   
3         Length on fourth, extra bounce. Good leave   
4  A bit of away movement, that was late swing. L...   

                                 cleaned_description  
0  away swing low bounce beat outside edge landed...  
1  pulled behind square oldschool rohit standing ...  
2              touch straight clipped pad square leg  
3              length fourth extra bounce good leave  
4  bit away movement late swing landed around fif...  


In [15]:
# Example labeled data (expand this with your own labels)
labeled_data = [
    {"description": "Away swing, low bounce, beats the outside edge...", "label": "Good ball"},
    {"description": "Pulled behind square. Old-school Rohit, standing upright, waiting for the shortish ball...", "label": "Short ball"},
    {"description": "Touch too straight, clipped off the pads. Through square leg", "label": "Full ball"},
    {"description": "Length on fourth, extra bounce. Good leave", "label": "Good ball"},
    {"description": "A bit of away movement, that was late swing. Landed around fifth...", "label": "Good ball"},
    {"description": "Wide and full, swing away but it's outside the tramline...", "label": "Wide ball"},
    {"description": "Overpitched on fourth, 135ks, swing away. Driven softly...", "label": "Full ball"},
    {"description": "Three-quarter length on fifth, Gill happy to take a decent front stride...", "label": "Good ball"},
    {"description": "Fullish on fifth, 135ks, defended towards cover", "label": "Full ball"},
    {"description": "Flicked aerially but in the gap to the left of midwicket...", "label": "Full ball"},
    # Add more if needed
]

# Convert to DataFrame
labeled_df = pd.DataFrame(labeled_data)

# Preprocess labeled descriptions
labeled_df['cleaned_description'] = labeled_df['description'].apply(preprocess_text)

# Placeholder: If you have more labeled data in a separate file, load it here
# labeled_df = pd.read_csv('labeled_ball_types.csv')
print("Labeled data prepared!")
print(labeled_df)

Labeled data prepared!
                                         description       label  \
0  Away swing, low bounce, beats the outside edge...   Good ball   
1  Pulled behind square. Old-school Rohit, standi...  Short ball   
2  Touch too straight, clipped off the pads. Thro...   Full ball   
3         Length on fourth, extra bounce. Good leave   Good ball   
4  A bit of away movement, that was late swing. L...   Good ball   
5  Wide and full, swing away but it's outside the...   Wide ball   
6  Overpitched on fourth, 135ks, swing away. Driv...   Full ball   
7  Three-quarter length on fifth, Gill happy to t...   Good ball   
8    Fullish on fifth, 135ks, defended towards cover   Full ball   
9  Flicked aerially but in the gap to the left of...   Full ball   

                                 cleaned_description  
0            away swing low bounce beat outside edge  
1  pulled behind square oldschool rohit standing ...  
2              touch straight clipped pad square leg  
3       

In [16]:
# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the labeled data
X_labeled = vectorizer.fit_transform(labeled_df['cleaned_description'])
y_labeled = labeled_df['label']

# Split into training and testing sets (optional, if you have enough data)
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)

# Train Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate (skip if too little data for a meaningful split)
if X_test.shape[0] > 0:
    y_pred = model.predict(X_test)
    print("Model Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
else:
    print("Not enough data to evaluate; proceeding with full training set.")
    model.fit(X_labeled, y_labeled)

Model Accuracy: 0.0
              precision    recall  f1-score   support

   Full ball       0.00      0.00      0.00       1.0
   Good ball       0.00      0.00      0.00       0.0
  Short ball       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Transform the entire dataset's cleaned descriptions using the same vectorizer
X_full = vectorizer.transform(df['cleaned_description'])

# Predict ball types
df['predicted_ball_type'] = model.predict(X_full)

print("Predictions completed!")
print(df[['Ball Number', 'Description', 'predicted_ball_type']].head())

Predictions completed!
   Ball Number                                        Description  \
0          0.1  Away swing, low bounce, beats the outside edge...   
1          0.2  Pulled behind square. Old-school Rohit, standi...   
2          0.3  Touch too straight, clipped off the pads. Thro...   
3          0.4         Length on fourth, extra bounce. Good leave   
4          0.5  A bit of away movement, that was late swing. L...   

  predicted_ball_type  
0           Good ball  
1           Good ball  
2           Full ball  
3           Good ball  
4           Good ball  


In [18]:
# Save to a new CSV
df.to_csv('final_dataset_with_ball_types.csv', index=False)
print("Final dataset saved as 'final_dataset_with_ball_types.csv'")

Final dataset saved as 'final_dataset_with_ball_types.csv'
