In [51]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import csv
import string
import pickle
from sentistrength import PySentiStr
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import LabelPowerset
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [52]:
dataset_new = pd.read_csv('dataset_new.csv')
dataset = dataset_new

missing_values_check = dataset.isnull().sum()
print(missing_values_check)

Unnamed: 0    0
Review        0
UpVotes       0
Rating        0
Label         0
Length        0
Positive      0
Negative      0
Present       0
Past          0
Future        0
b             0
f             0
r             0
u             0
dtype: int64


In [53]:
categories = list(dataset.columns.values)
categories = categories[11:]
print(categories)
# b for bug_reports, f for feature_requests, r for ratings, u for user_experience

['b', 'f', 'r', 'u']


In [54]:
counts = []
for category in categories:
    counts.append((category, dataset[category].sum()))
df_stats = pd.DataFrame(counts, columns=['category', 'number of reviews'])
print(df_stats)

  category  number of reviews
0        b                783
1        f                218
2        r                260
3        u                752


In [55]:
train, test = train_test_split(dataset, test_size=0.25, random_state=13, shuffle=True)
train_text = train['Review'].values.astype('U')
test_text = test['Review'].values.astype('U')

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), max_features=15000)
vectorizer.fit(train_text)
vectorizer.fit(test_text)
pickle.dump(vectorizer, open("vectorizer.pickle", "wb"))
print(len(vectorizer.get_feature_names()))

x_train = vectorizer.transform(train_text)
y_train = train.drop(labels=['Unnamed: 0', 'UpVotes', 'Rating', 'Label', 'Review', 'Length', 'Positive', 'Negative', 'Present', 'Past', 'Future'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels=['Unnamed: 0', 'UpVotes', 'Rating', 'Label', 'Review', 'Length', 'Positive', 'Negative', 'Present', 'Past', 'Future'], axis=1)

15000


In [56]:
print(type(x_train))
print(type(y_train))
print(type(x_test))
print(type(y_test))
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.frame.DataFrame'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.frame.DataFrame'>
(1509, 15000)
(1509, 4)
(504, 15000)
(504, 4)


# Multi-class classification (only BOW, excluding metadata)

## Multiple Binary Classifications - (One Vs Rest Classifier)

In [57]:
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', max_iter=10000), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} reviews...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    pickle.dump(LogReg_pipeline, open("classifier_"+ str(category) +".pickle", "wb"))
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing b reviews...**
Test accuracy is 0.7579365079365079


**Processing f reviews...**
Test accuracy is 0.8888888888888888


**Processing r reviews...**
Test accuracy is 0.8650793650793651


**Processing u reviews...**
Test accuracy is 0.6924603174603174


