In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import GridSearchCV
import pickle

In [None]:
# Read the merged dataset and keep its 'label' column as the classification target.
DATA_PATH = '/content/mergedData.csv'
df = pd.read_csv(DATA_PATH)
label = df['label']

In [None]:
# TF-IDF featurizer that emits both unigrams and bigrams.
tf = TfidfVectorizer(
    ngram_range=(1, 2),
)

In [None]:
tf_vec = tf.fit_transform(df['text'])

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    tf_vec,label, test_size=0.3, shuffle=True, random_state=42,)

In [None]:
# Hyper-parameter grid for the random forest search:
# three forest sizes crossed with four depth limits (12 combinations).
parameters = dict(
    n_estimators=[100, 200, 300],
    max_depth=[70, 80, 90, 100],
)

In [None]:
# Base estimator handed to the grid search. Seeded (same seed as the
# train/test split) so repeated runs of the search are comparable.
model = RandomForestClassifier(random_state=42)

In [None]:
# Exhaustive search over `parameters`, scored by accuracy with 2-fold CV.
clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='accuracy',
    cv=2,
)

In [None]:
# Run the search on the training split only, so the held-out test rows never
# influence which hyper-parameters are selected.
clf.fit(x_train, y_train)

In [None]:
# Display the hyper-parameter combination selected by the grid search.
clf.best_params_

{'max_depth': 100, 'n_estimators': 200}

In [None]:
# Final model: build the forest from the hyper-parameters the grid search
# selected instead of hard-coding values that can drift from the search result.
# Seeded so the trained (and pickled) model is reproducible.
model = RandomForestClassifier(**clf.best_params_, random_state=42)

In [None]:
# Train the final forest on the training split.
model.fit(X=x_train, y=y_train)

In [None]:
# Cross-validated accuracy of the model configuration over the full feature
# matrix (one score per fold).
# NOTE(review): the recorded fold scores vary widely (about 0.94 down to 0.74);
# the rows may be ordered in the CSV - consider shuffled folds. TODO confirm.
cross_val_score(
    estimator=model,
    X=tf_vec,
    y=label,
    scoring='accuracy',
)

array([0.93512512, 0.93697868, 0.9174397 , 0.89146568, 0.74211503])

In [None]:
# Predicted labels for the held-out test split.
pred = model.predict(X=x_test)

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
report = classification_report(y_true=y_test, y_pred=pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.52      0.69       402
           1       0.86      1.00      0.93      1216

    accuracy                           0.88      1618
   macro avg       0.93      0.76      0.81      1618
weighted avg       0.90      0.88      0.87      1618



In [None]:
# Persist the fitted vectorizer and the trained forest as pickle files, one
# file per artifact, in the current working directory.
for path, artifact in (
    ('tfidf_tokenizer.pkl', tf),
    ('random_forest.pkl', model),
):
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)