In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
df = pd.read_csv('../Dataset/mergedData.csv')
label = df['label']

In [3]:
tf = TfidfVectorizer(ngram_range=(1,2))

In [4]:
tf_vec = tf.fit_transform(df['text'])

In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    tf_vec,label, test_size=0.3, shuffle=True, random_state=42,)

In [6]:
parameters = {
    'n_estimators':[100,200,300],
    'max_depth': [70, 80 , 90, 100]
}

In [7]:
model = RandomForestClassifier()

In [8]:
clf = GridSearchCV(model,param_grid=parameters,scoring='accuracy',cv = 2)

In [9]:
clf.fit(tf_vec,label)

In [10]:
clf.best_params_

{'max_depth': 90, 'n_estimators': 300}

In [11]:
model = RandomForestClassifier(n_estimators=100,max_depth=80)

In [12]:
model.fit(x_train,y_train)

In [13]:
cross_val_score(model,tf_vec,label, scoring='accuracy')

array([0.9360519 , 0.93327155, 0.91558442, 0.89332096, 0.73654917])

In [14]:
pred = model.predict(x_test)

In [16]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      0.51      0.68       402
           1       0.86      1.00      0.93      1216

    accuracy                           0.88      1618
   macro avg       0.93      0.75      0.80      1618
weighted avg       0.90      0.88      0.86      1618



In [17]:
with open('tfidf_tokenizer.pkl','wb') as f:
    pickle.dump(tf,f)

with open('random_forest.pkl', 'wb') as f:
    pickle.dump(model,f)