# Imports

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Data

In [None]:
column_names = ['polarity', 'title', 'text']
# Folder holding train.csv / test.csv. Defaults to the original location;
# set the TPX_DATA_DIR environment variable to run the notebook from another machine.
DATA_DIR = Path(os.environ.get('TPX_DATA_DIR', 'C:/Users/Mohit/Desktop/tpx'))
# Both files are read with header=None, so the column names are supplied explicitly.
df_train = pd.read_csv(DATA_DIR / 'train.csv', names=column_names, header=None)
df_test = pd.read_csv(DATA_DIR / 'test.csv', names=column_names, header=None)

# Text Preprocessing

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus that the stop-word filter below reads.
# When the package is already present locally, NLTK reports it as up-to-date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
# English stop-word list, stored as a set so the per-word membership test in
# remove_stopwords is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop stop words from a whitespace-separated string.

    Words are compared case-insensitively against the module-level
    ``stop_words`` set; surviving words keep their original casing and are
    re-joined with single spaces. Anything that is not a ``str`` (for
    example a missing value) is handed back unchanged.
    """
    if isinstance(text, str):
        kept = (token for token in text.split() if token.lower() not in stop_words)
        return " ".join(kept)
    return text

In [None]:
import string
exclude=string.punctuation
# Built once: the deletion table is identical for every call, and remove_punc
# is invoked once per row through Series.apply.
_PUNC_TABLE = str.maketrans("", "", exclude)

def remove_punc(text):
  """Strip every character listed in ``exclude`` (string.punctuation) from ``text``.

  Non-string inputs (for example missing values) are returned unchanged.
  """
  if isinstance(text, str):
    return text.translate(_PUNC_TABLE)
  return text

In [None]:
def preprocess(df):
  df['title'] = df['title'].str.lower()
  df['text'] = df['text'].str.lower()
  df['title']=df['title'].apply(remove_punc)
  df['text']=df['text'].apply(remove_punc)
  df['title'] = df['title'].apply(remove_stopwords)
  df['text'] = df['text'].apply(remove_stopwords)
  return df

In [None]:
# Run the same cleaning steps on both splits and rebind the names to the results.
df_train=preprocess(df_train)
df_test=preprocess(df_test)

# Tokenization + Vector Representation

#### Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
def _title_plus_text(frame):
  # One document per row: "<title> <text>".
  return frame['title']+" "+frame['text']

# Features are the joined text; labels are the polarity column.
X_train, y_train = _title_plus_text(df_train), df_train['polarity']
X_test, y_test = _title_plus_text(df_test), df_test['polarity']

In [None]:
# Rows whose title or text was missing produce a NaN document; drop them and
# keep the labels aligned by selecting on the surviving index.
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]  # keep labels aligned
# Same filter for the test split: CountVectorizer rejects NaN documents at
# predict time just as it does at fit time.
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]  # keep labels aligned

In [None]:
# Fixed seed so repeated runs (and every grid-search fold) build the same models.
SEED = 42

# Bag-of-words counts feeding a random forest; oob_score=True keeps the
# out-of-bag estimate on the fitted forest step.
pipeline1 = Pipeline([
    ('vect', CountVectorizer()),
    ('rf', RandomForestClassifier(oob_score=True, random_state=SEED)),
])
# Same vectoriser feeding gradient-boosted trees.
# NOTE(review): recent xgboost releases expect class labels encoded as
# 0..n_classes-1 — confirm the values in 'polarity' before fitting.
pipeline2 = Pipeline([
    ('vect', CountVectorizer()),
    ('xgb', XGBClassifier(random_state=SEED)),
])
# Search space shared by both pipelines; keys address the 'vect' step.
param_grid = {
    # (min_n, max_n) word n-gram ranges: unigrams only, up to bi-/tri-grams,
    # bigrams only, trigrams only, and bi+tri-grams.
    'vect__ngram_range': [(1,1), (1,2), (1,3),(2,2),(3,3),(2,3)],
    # Cap on vocabulary size.
    'vect__max_features': [1000,2000,5000,7000,10000,15000,20000,25000,30000],
}


In [None]:
# Baseline: fit the RandomForest pipeline with the vectoriser's default
# settings (param_grid is not applied here).
pipeline1.fit(X_train,y_train)

In [None]:
# Baseline: fit the XGBoost pipeline with the vectoriser's default
# settings (param_grid is not applied here).
pipeline2.fit(X_train,y_train)

In [None]:
# 5-fold, accuracy-scored search over param_grid for the RandomForest pipeline.
grid1 = GridSearchCV(
    estimator=pipeline1,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
    verbose=2,
)
grid1.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [None]:
# 5-fold, accuracy-scored search over param_grid for the XGBoost pipeline.
grid2 = GridSearchCV(
    estimator=pipeline2,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
    verbose=2,
)
grid2.fit(X_train, y_train)

# Evaluation

### RandomForest

In [None]:
# Vectoriser settings selected by the RandomForest grid search.
grid1.best_params_

In [None]:
# Cross-validated score (scoring='accuracy') of the selected RandomForest candidate.
grid1.best_score_

In [None]:
# Keep the pipeline that the grid search selected as best.
model_final1=grid1.best_estimator_

In [None]:
# Predict with the tuned pipeline chosen by the grid search (model_final1),
# not the untuned baseline pipeline1, so the test metrics describe the same
# model as the best_params_ / best_score_ / OOB values reported in this section.
y_predict1=model_final1.predict(X_test)

In [None]:
# sklearn.metrics provides accuracy_score / precision_score / recall_score; it has
# no bare `accuracy`, `precision` or `recall`, so importing those names raises
# ImportError. They are aliased to the short names the metric cells already call.
from sklearn.metrics import (
    accuracy_score as accuracy,
    confusion_matrix,
    precision_score as precision,
    recall_score as recall,
    f1_score,
    classification_report,
)

In [None]:
# Test-set metrics for the RandomForest predictions in y_predict1.
# NOTE(review): precision/recall/F1 are called with sklearn's default binary
# settings — confirm which polarity value should count as the positive class.
print("Accuracy : ",accuracy(y_test,y_predict1))
print("Precision : ",precision(y_test,y_predict1))
print("Recall : ",recall(y_test,y_predict1))
print("F1_score : ",f1_score(y_test,y_predict1))
print("Confusion_matrix : ",confusion_matrix(y_test,y_predict1))
# model_final1 is the whole Pipeline and has no oob_score_ attribute of its own;
# the out-of-bag score is stored on the fitted forest registered as step 'rf'.
print("Oob Score : ",model_final1.named_steps['rf'].oob_score_)

In [None]:
# Per-class precision / recall / F1 table for the RandomForest predictions.
rf_report = classification_report(y_test, y_predict1)
print("Classification Report : ", rf_report)

### XGBoost

In [None]:
# Vectoriser settings selected by the XGBoost grid search.
grid2.best_params_

In [None]:
# Cross-validated score (scoring='accuracy') of the selected XGBoost candidate.
grid2.best_score_

In [None]:
# Keep the pipeline that the grid search selected as best.
model_final2=grid2.best_estimator_

In [None]:
# Predict with the tuned pipeline chosen by the grid search (model_final2),
# not the untuned baseline pipeline2, so the test metrics describe the same
# model as the best_params_ / best_score_ values reported in this section.
y_predict2=model_final2.predict(X_test)

In [None]:
# Test-set metrics for the XGBoost predictions in y_predict2, printed in a fixed order.
xgb_metric_table = (
    ("Accuracy : ", accuracy),
    ("Precision : ", precision),
    ("Recall : ", recall),
    ("F1_score : ", f1_score),
    ("Confusion_matrix : ", confusion_matrix),
)
for metric_label, metric_fn in xgb_metric_table:
    print(metric_label, metric_fn(y_test, y_predict2))

In [None]:
# Per-class precision / recall / F1 table for the XGBoost predictions.
xgb_report = classification_report(y_test, y_predict2)
print("Classification Report : ", xgb_report)