# Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset

In [None]:
# Load the training and validation splits (features and labels are stored in
# separate CSV files).
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0; like the old flag with its default
# warn_bad_lines=True, it skips malformed rows and reports them.
# NOTE(review): the label files are read strictly, so a skipped feature row
# would leave features and labels with different lengths -- check the warnings.
data          = pd.read_csv('x_train.csv', on_bad_lines='warn')
labels        = pd.read_csv('y_train.csv')
valid_data    = pd.read_csv('valid_data.csv', on_bad_lines='warn')
valid_labels  = pd.read_csv('valid_label.csv')

# Training set Preprocessing

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Build the cleaned training corpus: one normalised, stemmed string per row of
# data['text'].
# The stemmer, the stop-word set and the compiled pattern do not change between
# reviews, so they are created once here instead of once per review (and, for
# the set, once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # 'not' is kept in the text rather than filtered out
stopword_set = set(all_stopwords)
non_letters = re.compile('[^a-zA-Z]')

corpus = []
for raw_text in data['text']:
    # Replace every non-letter with a space, lower-case, and tokenise.
    tokens = non_letters.sub(' ', raw_text).lower().split()
    # Drop stop words, then reduce each remaining word to its stem.
    stems = [ps.stem(word) for word in tokens if word not in stopword_set]
    corpus.append(' '.join(stems))

# Validation Set Preprocessing

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Build the cleaned validation corpus with the same steps as the training
# corpus: strip non-letters, lower-case, remove stop words (except 'not'), stem.
# Loop-invariant objects are created once, outside the loop, instead of once
# per review (and, for the stop-word set, once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # 'not' is kept in the text rather than filtered out
stopword_set = set(all_stopwords)
non_letters = re.compile('[^a-zA-Z]')

valid_corpus = []
for raw_text in valid_data['text']:
    # Replace every non-letter with a space, lower-case, and tokenise.
    tokens = non_letters.sub(' ', raw_text).lower().split()
    # Drop stop words, then reduce each remaining word to its stem.
    stems = [ps.stem(word) for word in tokens if word not in stopword_set]
    valid_corpus.append(' '.join(stems))

# Vectorizing using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Feature matrices: the vectorizer is fitted on the training corpus only and
# then reused, already fitted, to transform the validation corpus.
tf_v = TfidfVectorizer()
X = tf_v.fit_transform(corpus)
X_valid = tf_v.transform(valid_corpus)

# Targets: the last column of each label frame, as plain arrays.
y = labels.iloc[:, -1].values
y_valid = valid_labels.iloc[:, -1].values

# Training the Naive Bayes model on the Training set

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, fitted on the TF-IDF training
# matrix. Every model cell rebinds the same name `classifier`, so the
# prediction cells use whichever of these cells was executed last.
classifier = MultinomialNB()
classifier.fit(X, y)

# Training the Adaptive Boosting Model on Training set

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with default settings, fitted on the TF-IDF training matrix.
# Rebinds `classifier`, replacing any model assigned by an earlier cell.
classifier = AdaBoostClassifier()
classifier.fit(X, y)

# Training the Logistic Regression model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression (random_state fixed at 0), fitted on the TF-IDF training
# matrix. Rebinds `classifier`, replacing any model assigned by an earlier cell.
classifier = LogisticRegression(random_state = 0)
classifier.fit(X, y)

# Training the Random Forest Classification model on the Training set

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default settings, fitted on the TF-IDF training matrix.
# No random_state is passed here, unlike the logistic-regression and SVM cells.
# Rebinds `classifier`, replacing any model assigned by an earlier cell.
classifier = RandomForestClassifier()
classifier.fit(X, y)

# Training the SVM model on Training set

In [None]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier with C = 1e5 and random_state fixed at 0,
# fitted on the TF-IDF training matrix. This is the last model cell, so when the
# notebook is run top to bottom it is the model the prediction cells use.
classifier = SVC(kernel = 'rbf', C = 1E5, random_state = 0)
classifier.fit(X, y)

# Predicting the Validation set results

In [None]:
# Predict labels for the validation feature matrix with the most recently
# fitted `classifier`.
y_pred = classifier.predict(X_valid)

# Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Evaluate the validation predictions: raw confusion matrix, per-class
# precision/recall/F1 report, and overall accuracy.
cm = confusion_matrix(y_valid, y_pred)
# Display names for the report (spelling of "Negative" corrected).
# NOTE(review): target_names are matched to the labels in sorted order, so this
# assumes the smaller label value means "negative" -- confirm against the data.
sentiment_review = ['Negative Review', 'Positive Review']
print(cm)
print(classification_report(y_valid, y_pred, target_names=sentiment_review))
print("Accuracy Score:", accuracy_score(y_valid, y_pred))

# Plotting Confusion Matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

# Draw the confusion matrix `cm` as an annotated heatmap.
# sklearn's confusion_matrix puts the true classes on the rows and the
# predicted classes on the columns; the heatmap draws rows along the y-axis and
# columns along the x-axis, so y is "True" and x is "Predicted".
# Tick labels follow the same Negative-then-Positive order as the
# classification report's `sentiment_review` names.
# NOTE(review): assumes the smaller label value means "negative" -- confirm.
x_labels = ["Negative", "Positive"]
y_labels = ["Negative", "Positive"]
plt.figure(figsize=(7,5))
sb.heatmap(cm, annot=True, fmt='d', xticklabels=x_labels, yticklabels=y_labels)
# NOTE(review): `cm` is computed from the validation split, not the test split.
plt.title("Test Result Confusion Matrix")
plt.xlabel("Predicted Values")
plt.ylabel("True Values")

plt.show()

# Testing the model

# Importing the test dataset

In [None]:
# Load the unlabeled test set. `on_bad_lines='warn'` replaces
# `error_bad_lines=False` (deprecated in pandas 1.3, removed in 2.0) and keeps
# the same behaviour: malformed rows are skipped and reported.
test  = pd.read_csv('test_data.csv', on_bad_lines='warn')

# Test Dataset Preprocessing

In [None]:
# Build the cleaned test corpus with the same steps used for the training and
# validation text: strip non-letters, lower-case, remove stop words (except
# 'not'), stem.
# Fixes a NameError: the original comprehension iterated over an undefined
# `new_review` instead of the tokens of the current test review.
# Loop-invariant objects are created once, outside the loop.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # 'not' is kept in the text rather than filtered out
stopword_set = set(all_stopwords)
non_letters = re.compile('[^a-zA-Z]')

test_corpus = []
for raw_text in test['text']:
    # Replace every non-letter with a space, lower-case, and tokenise.
    tokens = non_letters.sub(' ', raw_text).lower().split()
    # Drop stop words, then reduce each remaining word to its stem.
    stems = [ps.stem(word) for word in tokens if word not in stopword_set]
    test_corpus.append(' '.join(stems))

# Vectorizing using TF-IDF model

In [None]:
# Vectorize the test corpus with the already-fitted TF-IDF vectorizer
# (transform only, so the test text is not used for fitting).
X_test  = tf_v.transform(test_corpus)

# Model Prediction Test

In [None]:
# Predict labels for the test feature matrix with the most recently fitted
# `classifier`.
test_pred = classifier.predict(X_test)

# Saving Predicted Result into .csv file

In [None]:
# Write the test rows together with their predicted label to predict_input.csv.
# The predictions were computed from the already-loaded `test` frame, so a copy
# of that frame is used here. Re-reading test_data.csv without the bad-line
# handling used at load time could raise on a malformed row or yield a frame
# whose length does not match `test_pred`.
test_input = test.copy()
test_input['predicted'] = test_pred
test_input.to_csv('predict_input.csv', index=False)