In [120]:
 # This file will use the random forest algorithm to classify the text in the dataset to predict the polarity of the text.
    # The dataset is a collection of reddit post that is manually labeled as positive or negative.

# Import the necessary libraries
# Data Processing
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Modelling 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns


[nltk_data] Downloading package stopwords to /Users/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Dataset Preparation

In [121]:
# Import the dataset
R_df = pd.read_csv('../Data/Relavance.csv')
P_df = pd.read_csv('../Data/Polarity.csv')

# # Remove special characters from Title and Data columns such as as ð,Ÿ,˜,â,€,™, and ðŸ˜
R_df['Data'] = R_df['Data'].str.replace('ðŸ˜','')
R_df['Data'] = R_df['Data'].str.replace('ðŸ‘€','')
# Remove ðŸ
P_df['Data'] = P_df['Data'].str.replace('ðŸ˜','')
P_df['Data'] = P_df['Data'].str.replace('ðŸ‘€','')

# # Replace all the NaN values with empty string
R_df['Data'] = R_df['Data'].fillna('')
R_df['Relavance'] = R_df['Relavance'].fillna('')
P_df['Data'] = P_df['Data'].fillna('')    
P_df['Polarity'] = P_df['Polarity'].fillna('')

# Replace \n with empty string
R_df['Data'] = R_df['Data'].str.replace('\n','')
P_df['Data'] = P_df['Data'].str.replace('\n','')

# Flags to run either the relavance or polarity model
relavance = True
polarity = True

# print(df.head())
# print(df.shape)
# print(R_df.info())
# print(R_df.columns.values)

In [122]:
# Preprocessing Function for each data
def preprocess_bows(df, df_size):
    data_soup = BeautifulSoup(df)
    data_text = data_soup.get_text()
    data_letters_only = re.sub("[^a-zA-Z]", " ", data_text).lower()
    data_words = data_letters_only.split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in data_words if not w in stops]

    if((i)%500 == 0):
        print("Cleaned %d %d data (%d %%)." % (i, df_size, (i/df_size)*100))
    return(" ".join(meaningful_words))

In [123]:
# Preprocess the data
R_df_size = R_df['Data'].size
P_df_size = P_df['Data'].size

if relavance:
    for i in range(R_df_size):
        R_df['Data'][i] = preprocess_bows(R_df['Data'][i], R_df_size)
    print("Relavance Data Cleaned")

if polarity:
    for i in range(P_df_size):
        P_df['Data'][i] = preprocess_bows(P_df['Data'][i], P_df_size)
    print("Polarity Data Cleaned")

Cleaned 0 4300 data (0 %).


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  R_df['Data'][i] = preprocess_bows(R_df['Data'][i], R_df_size)


Cleaned 500 4300 data (11 %).
Cleaned 1000 4300 data (23 %).
Cleaned 1500 4300 data (34 %).
Cleaned 2000 4300 data (46 %).
Cleaned 2500 4300 data (58 %).
Cleaned 3000 4300 data (69 %).


In [None]:
# Prepare Training Features

if relavance:
    R_cv = CountVectorizer( analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    X_R = R_df['Data']
    y_R = R_df['Relavance']

    # Split the dataset into training and testing set
    X_R_train, X_R_test, y_R_train, y_R_test = train_test_split(X_R, y_R, test_size=0.3, random_state=42)
    print("Relavance test split done")
    print(X_R_train.shape, X_R_test.shape, y_R_train.shape, y_R_test.shape)

if polarity:
    P_cv = CountVectorizer( analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    # X = P_df.drop('Polarity', axis=1)
    X_P = P_df['Data']
    y_P = P_df['Polarity']

    # Split the dataset into training and testing set
    X_P_train, X_P_test, y_P_train, y_P_test = train_test_split(X_P, y_P, test_size=0.3, random_state=42)
    print("Polarity test split done")
    print(X_P_train.shape, X_P_test.shape, y_P_train.shape, y_P_test.shape)

Polarity
(3010,) (1290,) (3010,) (1290,)


In [None]:
# Convert train, validation and test data to vectors
if relavance:
    X_R_train = R_cv.fit_transform(X_R_train)
    X_R_test = R_cv.transform(X_R_test)
    X_R_train = X_R_train.toarray()
    X_R_test = X_R_test.toarray()
if polarity:
    X_P_train = P_cv.fit_transform(X_P_train)
    X_P_test = P_cv.transform(X_P_test)
    X_P_train = X_P_train.toarray()
    X_P_test = X_P_test.toarray()


In [None]:
if relavance:
    vocab_R = R_cv.get_feature_names_out()
    print(vocab_R)
if polarity:
    vocab_P = P_cv.get_feature_names_out()
    print(vocab_P)
    distribution_P = np.sum(X_P_train, axis=0)
    print("Printing first 10 vocab-dist pairs:")
    for tag, count in zip(vocab_P[:10], distribution_P[:10]):
        print(count, tag)

['aaml' 'ab' 'abilities' ... 'zipper' 'zverev' 'zverevs']
Printing first 10 vocab-dist pairs:
5 aaml
4 ab
5 abilities
9 ability
51 able
7 absolu
5 absolueml
10 absoluml
7 absolute
25 absolutely


In [None]:
# # Encode the labels
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y_P_train = le.fit_transform(y_P_train)
# y_P_test = le.fit_transform(y_P_test)

# Fitting and Training the model
if relavance:
    rf_R = RandomForestClassifier()
    rf_R.fit(X_R_train, y_R_train)
    print("Relavance Model Fitted")
if polarity:
    rf_P = RandomForestClassifier()
    rf_P.fit(X_P_train, y_P_train)
    print("Polarity Model Fitted")


In [None]:
# Predicting the test set results
if relavance:
    y_R_pred = rf_R.predict(X_R_test)
    print("Relavance Accuracy: ", accuracy_score(y_R_test, y_R_pred))
if polarity:
    y_P_pred = rf_P.predict(X_P_test)
    print("Polarity Accuracy: ", accuracy_score(y_P_test, y_P_pred))

Polarity Accuracy:  0.6441860465116279
