<a href="https://colab.research.google.com/github/mrunallachake/FakeReview/blob/master/RandomForestXGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import io
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from google.colab import output

In [0]:
nltk.download('stopwords')
from nltk.corpus import stopwords

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
downloaded = drive.CreateFile({'id':'1x5_EAynaSjZq7m_8vT6tTbrt9QGiBBjD'}) 
downloaded.GetContentFile('amazon_reviews.tsv') 

In [0]:
import pandas as pd
dataset = pd.read_csv('amazon_reviews.tsv', delimiter= '\t', quoting=3) 
X = dataset.iloc[:,[2,3]].values
y = dataset.iloc[:,1].values
display(dataset)

In [0]:
print(type(X))
print(X.shape)
print(X)

In [0]:
#handling categarical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelEncoder_y= LabelEncoder()
y = labelEncoder_y.fit_transform(y)

In [0]:
corpus = []
review_len = []
for i in range(0, 21000):
    review = re.sub('[^a-zA-Z]',' ', dataset['REVIEW_TEXT'][i])
    review = review.lower()
    rev_len = len(review)
    review_len.append(rev_len)
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [0]:
c_corpus = list(corpus)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features = 750)
corpus = cv.fit_transform(corpus)

In [0]:
review_len = np.array(review_len)
review_len = review_len.T
X = np.append(X,corpus.toarray(),axis=1)
X = np.column_stack((X,review_len))
print(X)

In [0]:
X = np.delete(X,1,axis=1)
print(X)

In [0]:
from random import randint
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [0]:
from xgboost import XGBRFClassifier
classifier = XGBRFClassifier(n_estimators = 150, criterion = 'gini', random_state = 0, min_samples_leaf = 3, min_samples_split = 5,
                             max_depth = 15, learning_rate = 0.01, gamma = 0.4, min_child_weight = 0.5, colsample_bytree = 0.3,
                             colsample_bylevel = 0.2)

In [0]:
classifier.fit(X_train, y_train)

In [0]:
y_pred = classifier.predict(X_test)

In [0]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1])
print(accuracy*100)

[[1445  653]
 [ 835 1267]]
64.57142857142857


In [0]:
precision = (cm[0][0])/(cm[0][0]+cm[0][1])
recall = (cm[0][0])/(cm[0][0]+cm[1][0])
print(precision)
print(recall)

0.6887511916110581
0.6337719298245614


In [0]:
 !pip install mlens
 import mlens
 print(mlens.__version__)

In [0]:
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve
# Create range of values for parameter
param_range = [0.00001,0.0001,0.001,0.01,0.1,1]

# Calculate accuracy on training and test set using range of parameter values
train_scores, test_scores = validation_curve(XGBRFClassifier(n_estimators = 150, criterion = 'gini', random_state = 0, min_samples_leaf = 3, min_samples_split = 5,
                                             max_depth = 15, learning_rate = 0.01, subsample = 0.5, gamma = 0.4, colsample_bytree =  0.3, colsample_bylevel = 0.2), 
                                             X_train, 
                                             y_train, 
                                             param_name="alpha", 
                                             param_range=param_range,
                                             cv=3, 
                                             scoring="accuracy")

output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

#colsample_bytree =  0.3
#colsample_bylevel = 0.7,0.2
#colsample_bynode = 0.2,0.3

In [0]:
# Calculate mean and standard deviation for training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Calculate mean and standard deviation for test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

In [0]:
# Plot mean accuracy scores for training and test sets
plt.plot(param_range, train_mean, label="Training score", color="black")
plt.plot(param_range, test_mean, label="Cross-validation score", color="dimgrey")

# Plot accurancy bands for training and test sets
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, color="gray")
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, color="gainsboro")

# Create plot
plt.title("Validation Curve With Random Forest")
plt.xlabel("Number Of Trees")
plt.ylabel("Accuracy Score")
plt.tight_layout()
plt.legend(loc="best")
plt.show()

print(test_mean)