In [46]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import csv
import string
from sentistrength import PySentiStr
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import LabelPowerset
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [47]:
dataset_new = pd.read_csv('dataset_new.csv')
dataset = dataset_new

missing_values_check = dataset.isnull().sum()
print(missing_values_check)

Unnamed: 0    0
Review        0
UpVotes       0
Rating        0
Label         0
Length        0
Positive      0
Negative      0
Present       0
Past          0
Future        0
b             0
f             0
r             0
u             0
dtype: int64


In [48]:
categories = list(dataset.columns.values)
categories = categories[11:]
print(categories)
# b for bug_reports, f for feature_requests, r for ratings, u for user_experience

['b', 'f', 'r', 'u']


In [49]:
counts = []
for category in categories:
    counts.append((category, dataset[category].sum()))
df_stats = pd.DataFrame(counts, columns=['category', 'number of reviews'])
print(df_stats)

  category  number of reviews
0        b                783
1        f                218
2        r                260
3        u                752


In [50]:
train, test = train_test_split(dataset, test_size=0.25, random_state=13, shuffle=True)
train_text = train['Review'].values.astype('U')
test_text = test['Review'].values.astype('U')

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), max_features=15000)
vectorizer.fit(train_text)
vectorizer.fit(test_text)
print(len(vectorizer.get_feature_names()))

x_train = vectorizer.transform(train_text)
y_train = train.drop(labels=['Unnamed: 0', 'UpVotes', 'Rating', 'Label', 'Review', 'Length', 'Positive', 'Negative', 'Present', 'Past', 'Future'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels=['Unnamed: 0', 'UpVotes', 'Rating', 'Label', 'Review', 'Length', 'Positive', 'Negative', 'Present', 'Past', 'Future'], axis=1)

15000


In [9]:
print(type(x_train))
print(type(y_train))
print(type(x_test))
print(type(y_test))
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.frame.DataFrame'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.frame.DataFrame'>
(1509, 15000)
(1509, 4)
(504, 15000)
(504, 4)


# Multi-class classification (only BOW, excluding metadata)

## Multiple Binary Classifications - (One Vs Rest Classifier)

In [19]:
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', max_iter=10000), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} reviews...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing b reviews...**
Test accuracy is 0.7579365079365079


**Processing f reviews...**
Test accuracy is 0.8888888888888888


**Processing r reviews...**
Test accuracy is 0.8650793650793651


**Processing u reviews...**
Test accuracy is 0.6924603174603174




## Multiple Binary Classifications - (Binary Relevance)

In [20]:
# using binary relevance
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.3948412698412698




## Classifier Chains

### Logistic Regression

In [21]:
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression(solver='lbfgs', max_iter=10000))

# Training logistic regression model on train data
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test, predictions))
print("\n")

Accuracy =  0.5238095238095238




### DecisionTree Classifier

In [31]:
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(DecisionTreeClassifier(random_state=13))

# Training logistic regression model on train data
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test, predictions))
print("\n")

Accuracy =  0.5198412698412699




### RandomForest Classifier

In [36]:
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(RandomForestClassifier())

# Training logistic regression model on train data
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test, predictions))
print("\n")



Accuracy =  0.4087301587301587




## Label Powerset

### Logistic Regression

In [22]:
# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression(solver='lbfgs', max_iter=10000))

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")



Accuracy =  0.6130952380952381




### DecisionTree Classifier

In [37]:
# initialize label powerset multi-label classifier
classifier = LabelPowerset(DecisionTreeClassifier(random_state=13))

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.49007936507936506




### RandomForest Classifier

In [38]:
# initialize label powerset multi-label classifier
classifier = LabelPowerset(RandomForestClassifier())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")



Accuracy =  0.5297619047619048




# Multi-class classification (BOW + metadata)

## Adding metadata to the dataset containing only bag of words

In [23]:
train1 = train.to_numpy()
test1 = test.to_numpy()
x_train1 = x_train.toarray()
x_test1 = x_test.toarray()
y_train1 = y_train.to_numpy()
y_test1 = y_test.to_numpy()

In [24]:
# add the columns: ['Rating', 'Length', 'Positive', 'Negative', 'Present', 'Past', 'Future'] in the dataset
x_train2 = np.hstack((x_train1, train1[:, [3,5,6,7,8,9,10]]))
x_test2 = np.hstack((x_test1, test1[:, [3,5,6,7,8,9,10]]))

x_train2 = x_train2.astype(float)
x_test2 = x_test2.astype(float)

## Multiple Binary Classifications - (One Vs Rest Classifier)

In [29]:
# it is taking a hell lot of time. giving similar accuracy scores as without metadata
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', max_iter=10000), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} reviews...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train2, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test2)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing b reviews...**
Test accuracy is 0.7559523809523809


**Processing f reviews...**
Test accuracy is 0.8888888888888888


**Processing r reviews...**


KeyboardInterrupt: 

## Multiple Binary Classifications - (Binary Relevance)

In [26]:
# using binary relevance
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(x_train2, y_train1)

# predict
predictions = classifier.predict(x_test2)

# accuracy
print("Accuracy = ",accuracy_score(y_test1, predictions))
print("\n")

Accuracy =  0.4107142857142857




## Classifier Chains

### Logistic Regression

In [39]:
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression(solver='lbfgs', max_iter=10000))

# Training logistic regression model on train data
classifier.fit(x_train2, y_train)

# predict
predictions = classifier.predict(x_test2)

# accuracy
print("Accuracy = ",accuracy_score(y_test, predictions))
print("\n")

Accuracy =  0.5773809523809523




### DecisionTree Classifier

In [40]:
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(DecisionTreeClassifier(random_state=13))

# Training logistic regression model on train data
classifier.fit(x_train2, y_train)

# predict
predictions = classifier.predict(x_test2)

# accuracy
print("Accuracy = ",accuracy_score(y_test, predictions))
print("\n")

Accuracy =  0.5257936507936508




### RandomForest Classifier

In [41]:
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(RandomForestClassifier())

# Training logistic regression model on train data
classifier.fit(x_train2, y_train)

# predict
predictions = classifier.predict(x_test2)

# accuracy
print("Accuracy = ",accuracy_score(y_test, predictions))
print("\n")



Accuracy =  0.3333333333333333




## Label Powerset

### Logistic Regression

In [42]:
# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression(solver='lbfgs', max_iter=10000))

# train
classifier.fit(x_train2, y_train)

# predict
predictions = classifier.predict(x_test2)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")



Accuracy =  0.6011904761904762




### DecisionTree Classifier

In [43]:
# initialize label powerset multi-label classifier
classifier = LabelPowerset(DecisionTreeClassifier(random_state=13))

# train
classifier.fit(x_train2, y_train)

# predict
predictions = classifier.predict(x_test2)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.5297619047619048




### RandomForest Classifier

In [44]:
# initialize label powerset multi-label classifier
classifier = LabelPowerset(RandomForestClassifier())

# train
classifier.fit(x_train2, y_train)

# predict
predictions = classifier.predict(x_test2)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")



Accuracy =  0.5793650793650794




In [45]:
# from the above results, Multiple Binary Classifications - (One Vs Rest Classifier) with only BOW 
# gives best results with average accuracy of 75.976%