In [54]:
import pandas as pd 
dataset = pd.read_csv('Train.csv')
grouped = dataset.groupby('label')

# Balance the data: draw 2000 rows from every 'label' group. Each group is
# sampled with the same fixed seed, so re-running the cell yields the same rows.
# The sampled frames are stacked in group order and keep their original index.
dataset = pd.concat(
    [
        label_rows.sample(n=2000, random_state=42)
        for _label, label_rows in grouped
    ]
)

In [55]:
import re
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')

In [56]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Build `corpus`: one cleaned, space-joined string per row of `dataset`,
# in the same positional order as the rows (so it lines up with the labels).
#
# Lemmatization is used instead of stemming so the resulting tokens stay
# readable and the downstream results are easier to interpret.

# Loop-invariant setup is done once, not once per review.
lemmatizer = WordNetLemmatizer()

# 'not' is removed from the stop-word list so that it survives in the cleaned
# text (presumably because negation matters for the labels -- confirm).
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# A set gives O(1) membership tests; the original rebuilt it for every word.
stopword_set = set(all_stopwords)

corpus = []
# Iterate over every row actually present instead of a hard-coded 4000, so the
# corpus always has exactly one entry per row of `dataset`.
for text in dataset['text']:
    # Replace everything that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', text)
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

In [57]:
#print(corpus)

In [58]:
#using TF-IDF for vectorization instead of bag of words
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned reviews into TF-IDF features, capping the vocabulary at
# 2500 terms.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its vocabulary and IDF weights also see the held-out
# rows -- consider fitting on the training portion only.
cv = TfidfVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()  # dense array, one row per review

# Select the target by column name, the same way 'label' and 'text' are
# addressed elsewhere in this notebook, instead of relying on it being the
# last column of the CSV.
y = dataset['label'].values

In [59]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [60]:
import lightgbm as lgb

In [61]:
# LightGBM classifier with explicitly chosen hyperparameters:
# 25 boosting rounds, trees limited to depth 20, and a small minimum
# child weight. n_jobs=-1 leaves the thread count to LightGBM.
classifier = lgb.LGBMClassifier(
    n_estimators=25,
    max_depth=20,
    min_child_weight=0.0016,
    n_jobs=-1,
)

In [62]:
from sklearn.model_selection import cross_val_score

# Fit on the training split, then predict the held-out rows.
# y_pred is stored but not evaluated anywhere in the visible cells.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# 10-fold cross-validation restricted to the training split; the mean and
# spread of the fold scores are reported as percentages.
accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=10,
)
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))

Accuracy: 79.88 %
Standard Deviation: 1.59 %


In [65]:
import pickle

# Serialize the fitted classifier to 'model.pkl' in the working directory.
# The file is opened in binary mode and closed automatically by the context manager.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(classifier, model_file)


In [67]:
# Serialize the fitted TF-IDF vectorizer next to the model, so the same
# vocabulary and weights can be applied to new text before prediction.
with open('vectorizer.pkl', mode='wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)