# **Importing Dataset and Libraries**

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [10]:
# Read the tab-separated review file into a DataFrame, then display it.
df = pd.read_csv(
    '../input/amazon-alexa-reviews/amazon_alexa.tsv',
    sep='\t',
)
df

In [11]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [15]:
# Drop the 'date' column.
# errors='ignore' keeps this cell re-runnable: executing it a second time
# (when 'date' has already been removed) is a no-op instead of a KeyError.
df = df.drop(columns=['date'], errors='ignore')

# **Exploratory Data Analysis**

In [16]:
# One histogram per numeric column: 30 bins, red bars, 13x5 inch figure.
hist_options = dict(bins=30, figsize=(13, 5), color='r')
df.hist(**hist_options)

In [17]:
# Store the character count of each review in a new 'length' column.
df['length'] = df['verified_reviews'].map(len)

In [18]:
# Summary statistics for the numeric columns (now including 'length').
df.describe()

In [54]:
# Distribution of review lengths, 100 bins.
df['length'].plot.hist(bins=100)

In [23]:
# Rows whose 'feedback' flag is 1 (positive reviews).
# The mask must be built from `df`; the previous `tweets_df` is not defined
# anywhere in this notebook and raised NameError on a fresh kernel.
positive = df[df['feedback'] == 1]
positive

In [24]:
# Rows whose 'feedback' flag is 0 (negative reviews).
negative = df.loc[df['feedback'] == 0]
negative

In [26]:
# Join every negative review into one space-separated string and render it
# as a word cloud on a 20x20 inch figure.
negative_as_one_string = " ".join(negative['verified_reviews'])
negative_cloud = WordCloud().generate(negative_as_one_string)
plt.figure(figsize=(20, 20))
plt.imshow(negative_cloud)

In [27]:
# Join every positive review into one space-separated string and render it
# as a word cloud on a 20x20 inch figure.
positive_as_one_string = " ".join(positive['verified_reviews'])
positive_cloud = WordCloud().generate(positive_as_one_string)
plt.figure(figsize=(20, 20))
plt.imshow(positive_cloud)

# **Data Cleaning**

In [37]:
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
#Removing punctuation and stopwords

# Build the stopword lookup once. Previously stopwords.words('english') was
# re-evaluated for every single word of every review and searched as a list;
# a frozenset built a single time gives the same membership results with
# constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def cleanText(text):
    """Strip punctuation from ``text`` and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The whitespace-separated tokens of ``text`` after every character in
        ``string.punctuation`` has been removed, excluding tokens whose
        lower-cased form is in NLTK's English stopword list. Kept tokens
        retain their original casing.
    """
    puncRemoved = ''.join([char for char in text if char not in string.punctuation])
    # Compare case-insensitively, but return the token as written.
    cleanWords = [word for word in puncRemoved.split() if word.lower() not in _ENGLISH_STOPWORDS]
    return cleanWords

# Token list for every review, produced by cleanText.
df_clean = df['verified_reviews'].map(cleanText)

In [41]:
# Compare the cleaned tokens with the raw text for the row labelled 5.
print(df_clean.loc[5])
print(df.loc[5, 'verified_reviews'])

In [44]:
# Bag-of-words features: cleanText is passed as the analyzer, so it decides
# which tokens each review contributes; counts are stored as uint8.
vectorizer = CountVectorizer(dtype=np.uint8, analyzer=cleanText)
reviewsVectorizer = vectorizer.fit_transform(df['verified_reviews'])

# Target column and the dense feature table built from the count matrix.
y = df['feedback']
X = pd.DataFrame(reviewsVectorizer.toarray())

In [45]:
# (number of reviews, number of distinct tokens produced by cleanText)
X.shape

# **Train & Evaluate Model**

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [48]:
# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so the split -- and every metric reported
# below -- is identical after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Multinomial Naive Bayes on the token-count features.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [49]:
# Predict on the held-out split and plot the confusion matrix.
y_predict_test = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)

# fmt='d' prints the integer counts exactly; the default annotation format
# ('.2g') rounds to two significant digits, so e.g. 868 shows as 8.7e+02.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix(y_true, y_pred): rows are true labels, columns predictions.
ax.set(xlabel='Predicted label', ylabel='True label');

In [50]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
nb_report = classification_report(y_test, y_predict_test)
print(nb_report)

In [53]:
#Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# confusion_matrix expects (y_true, y_pred). The arguments were previously
# swapped, which transposed the matrix relative to the Naive Bayes cell
# (false positives and false negatives traded places).
cm = confusion_matrix(y_test, y_pred)

# fmt='d' prints the integer counts exactly instead of the default '.2g'
# rounding (e.g. 8.7e+02).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')

print(classification_report(y_test, y_pred))

In [52]:
#Gradient Boosting
# random_state pins the estimator's internal randomness so repeated runs of
# this cell give the same model.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# confusion_matrix expects (y_true, y_pred). The arguments were previously
# swapped, which transposed the matrix relative to the Naive Bayes cell
# (false positives and false negatives traded places).
cm = confusion_matrix(y_test, y_pred)

# fmt='d' prints the integer counts exactly instead of the default '.2g'
# rounding (e.g. 8.7e+02).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')

print(classification_report(y_test, y_pred))