# Import Libraries

In [1]:
import pandas as pd 
import numpy as np 
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset

In [2]:
train_data= pd.read_csv('twitter_training.csv')
test_data= pd.read_csv('twitter_validation.csv')

# Info About The Dataset

In [3]:
# Print information summary of the training DataFrame
display(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


None

In [4]:
# Display the training DataFrame
display(train_data)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


# Rename columns

In [5]:
# Rename columns in the training DataFrame
column_names_train = {'2401': 'id', 'Borderlands': 'country', 'Positive': 'Label',
                      'im getting on borderlands and i will murder you all ,': 'text'}

train_data.rename(columns=column_names_train, inplace=True)

In [6]:
# Rename columns in the test DataFrame
column_names_test = {'3364': 'id', 'Facebook': 'country', 'Irrelevant': 'Label',
                     'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}

test_data.rename(columns=column_names_test, inplace=True)

# Drop Rows With NaN Values

In [7]:
# Drop rows with missing values from the training and test DataFrames
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [8]:
# Load the English language model
nlp = spacy.load("en_core_web_sm") 

# Preprocessing function

In [9]:
# Load English language model for preprocessing
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

# Apply Preprocessing To Data

In [10]:
# Apply preprocessing function to the 'text' column
train_data['Preprocessed Text'] = train_data['text'].apply(preprocess) 
test_data['Preprocessed Text'] = test_data['text'].apply(preprocess) 

In [11]:
# Define features and labels for the training set
X_train = train_data['Preprocessed Text'] 
y_train = train_data['Label']

In [12]:
# Define features and labels for the test set
X_test = test_data['Preprocessed Text']
y_test = test_data['Label']

# Label Encoding

In [13]:
# Initialize LabelEncoder
le = LabelEncoder()

# Encode labels for training set
y_train = le.fit_transform(y_train)

# Encode labels for test set
y_test = le.transform(y_test)

In [14]:
# Vectorization
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data to TF-IDF vectors
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data to TF-IDF vectors
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train The Model

In [15]:
# Train Random Forest Classifier
RFC_model = RandomForestClassifier()

# Train the Random Forest Classifier
RFC_model.fit(X_train_tfidf, y_train)

# Model Predictions

In [16]:
# Predict labels for the test set
y_pred = RFC_model.predict(X_test_tfidf)

# Calculate accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Generate classification report
report = classification_report(y_test, y_pred)

In [17]:
# Print the accuracy of the model
print("Accuracy:", accuracy)

Accuracy: 0.9519519519519519


In [18]:
# Print the classification report
print("\nClassification Report \n:", report)


Classification Report 
:               precision    recall  f1-score   support

           0       0.99      0.91      0.95       171
           1       0.95      0.95      0.95       266
           2       0.93      0.97      0.95       285
           3       0.96      0.96      0.96       277

    accuracy                           0.95       999
   macro avg       0.96      0.95      0.95       999
weighted avg       0.95      0.95      0.95       999

