In [None]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import string
from nltk import FreqDist


In [1]:
# Read the CSV from a path relative to the notebook's working directory.
df = pd.read_csv('data/judge_1377884607_tweet_product_company.csv')
# A negative n makes head() return every row except the last 7 (not just a preview).
df.head(-7)

NameError: name 'pd' is not defined

In [2]:
df.info()

NameError: name 'df' is not defined

In [None]:
df.columns

In [None]:
df.emotion_in_tweet_is_directed_at.value_counts()

In [None]:
df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts()

In [None]:
# Discard rows whose label is the ambiguous "I can't tell".
pos_neg = df.loc[
    df['is_there_an_emotion_directed_at_a_brand_or_product'].ne("I can't tell")
]

In [None]:
# Discard rows labelled as having no emotion toward a brand or product.
pos_neg = pos_neg.loc[
    pos_neg['is_there_an_emotion_directed_at_a_brand_or_product'].ne(
        "No emotion toward brand or product"
    )
]

In [None]:
pos_neg.is_there_an_emotion_directed_at_a_brand_or_product.value_counts()

# Heavily imbalanced dataset

### Positive / Negative Model

In [None]:
# Drop the brand/product column and give the text and label columns short names.
pos_neg = (
    pos_neg
    .drop(columns=['emotion_in_tweet_is_directed_at'])
    .rename(columns={
        'tweet_text': 'text',
        'is_there_an_emotion_directed_at_a_brand_or_product': 'target',
    })
)
pos_neg.head()

In [None]:
def remove_ats_and_hashtags(text):
    """Strip punctuation, @mentions and #hashtags from a piece of text.

    Every character in ``string.punctuation`` other than the entity prefixes
    is turned into a space, the text is split on whitespace, and any word
    whose first character is an entity prefix is dropped.

    Args:
        text: The string to clean.

    Returns:
        The surviving words joined by single spaces.
    """
    entity_prefixes = ('@', '#', '�')
    # One translation table instead of a replace() call per punctuation mark.
    punctuation_to_space = {
        ord(mark): ' ' for mark in string.punctuation if mark not in entity_prefixes
    }
    spaced_out = text.translate(punctuation_to_space)
    # split() never yields empty strings, so indexing the first character is safe.
    kept = [token for token in spaced_out.split() if token[0] not in entity_prefixes]
    return ' '.join(kept)

In [None]:
pos_neg['text'] = pos_neg['text'].map(remove_ats_and_hashtags)

In [None]:
# Encode the sentiment label as 0 (negative) / 1 (positive).
# Only the `target` column is rewritten, so no other column is scanned for these strings,
# and the result is assigned back rather than mutating the frame in place.
pos_neg['target'] = pos_neg['target'].replace({'Negative emotion': 0, 'Positive emotion': 1})

In [None]:
pos_neg['text'] = pos_neg['text'].str.lower()

In [None]:
pos_neg

### Word Tokenizer

In [None]:
pos_neg['text_tokenized'] = pos_neg['text'].apply(word_tokenize)

In [None]:
pos_neg

In [None]:
# pos_neg['text_tokenized'].explode()
pos_neg_freq_dist = FreqDist(pos_neg['text_tokenized'].explode())

In [None]:
def visualize_top_20(freq_dist, title):
    """Draw a bar chart of the 20 most common entries of ``freq_dist``.

    Args:
        freq_dist: Object exposing ``most_common(n)`` that yields
            ``(token, count)`` pairs.
        title: Text used as the plot title.
    """
    # Unzip the (token, count) pairs into two parallel sequences.
    unzipped = list(zip(*freq_dist.most_common(20)))
    tokens, counts = unzipped[0], unzipped[1]

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.bar(tokens, counts)

    # Label the figure and turn the token labels sideways so they stay readable.
    ax.set(title=title, ylabel="Count")
    ax.tick_params(axis="x", rotation=90)

In [None]:
visualize_top_20(pos_neg_freq_dist, "Top 20 Word Frequency")

In [None]:
pos_neg.head()

In [None]:
X = pos_neg.drop(['target'], axis=1)
y = pos_neg['target']

In [None]:
# Fix the seed so the split (and every score computed from it) is reproducible on re-run,
# and stratify on the label so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Build a baseline

In [None]:
# Baseline features: TF-IDF over the `text` column, capped at 500 terms.
tfidf = TfidfVectorizer(max_features=500)

# Fit the vocabulary on the training split only.
X_train_vectorized = tfidf.fit_transform(X_train['text'])

# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# newer versions need get_feature_names_out() here.
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training set with SMOTE. The seed is fixed so the synthetic samples,
# and every score computed from them, are identical on each run.
smote = SMOTE(k_neighbors=1, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vectorized, y_train)

In [None]:
y_train_resampled.value_counts()

In [None]:
pd.DataFrame.sparse.from_spmatrix(X_train_resampled, columns=tfidf.get_feature_names()).info()

In [None]:
# Instantiate a MultinomialNB classifier and fit it on the SMOTE-resampled training data
baseline_model = MultinomialNB()
baseline_model.fit(X_train_resampled, y_train_resampled)
# Cross-validate on the resampled data (not the original X_train_vectorized / y_train).
# NOTE(review): SMOTE ran before the folds were cut, so validation folds can contain synthetic
# points interpolated from samples in the training folds; these scores are likely optimistic.
baseline_cv = cross_val_score(baseline_model, X_train_resampled, y_train_resampled)
baseline_cv

### Remove Stopwords

In [None]:
from nltk.corpus import stopwords

stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)
new_stops = ('quot', 'rt', 'i')
stopwords_list += list(new_stops)

def remove_stopwords(token_list):
    """
    Filter a list of tokens against the module-level ``stopwords_list``.

    Returns a new list holding, in their original order, only the
    tokens that do not appear in ``stopwords_list``.
    """
    kept_tokens = []
    for token in token_list:
        if token in stopwords_list:
            continue
        kept_tokens.append(token)
    return kept_tokens

X_train["text_tokenized_without_stopwords"] = X_train['text_tokenized'].apply(remove_stopwords)

In [None]:
X_train.head()

In [None]:
# Same 500-term TF-IDF as the baseline, but the vectorizer now drops everything in stopwords_list.
# This rebinds `tfidf`, so the baseline vectorizer is no longer reachable after this cell.
tfidf = TfidfVectorizer(max_features=500, stop_words=stopwords_list)

# Vectorizes the `text` column; the `text_tokenized_without_stopwords` column is not used here.
X_train_vectorized = tfidf.fit_transform(X_train['text'])

# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# newer versions need get_feature_names_out() here.
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())

In [None]:
from imblearn.over_sampling import SMOTE
# Re-balance the stop-word-filtered features with SMOTE. The seed is fixed so the
# synthetic samples, and the scores derived from them, are reproducible.
smote = SMOTE(k_neighbors=5, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vectorized, y_train)

In [None]:
# Instantiate a MultinomialNB classifier and fit it on the SMOTE-resampled, stop-word-filtered features
stop_words_removed_model = MultinomialNB()
stop_words_removed_model.fit(X_train_resampled, y_train_resampled)
# Cross-validate on the resampled data (not the original X_train_vectorized / y_train).
# NOTE(review): SMOTE ran before the folds were cut, so validation folds can contain synthetic
# points interpolated from samples in the training folds; these scores are likely optimistic.
stop_words_removed_cv = cross_val_score(stop_words_removed_model, X_train_resampled, y_train_resampled)
stop_words_removed_cv

In [None]:
# Run this cell without changes
print("Baseline:         ", baseline_cv.mean())
print("Stopwords removed:", stop_words_removed_cv.mean())

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score



In [None]:
# Project the test tweets with the vocabulary and IDF weights learned on the training set.
# Refitting on the test set (fit_transform) would build a different vocabulary, so the
# columns would no longer correspond to the features the model was trained on.
X_test_vectorized = tfidf.transform(X_test['text'])
stop_words_removed_preds = stop_words_removed_model.predict(X_test_vectorized)

print(classification_report(y_test, stop_words_removed_preds))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbour classifier trained on the SMOTE-resampled training features.
knn_5 = KNeighborsClassifier(n_neighbors=5)
knn_5.fit(X_train_resampled, y_train_resampled)
# Cross-validate this estimator on the resampled training data
# (the original passed an undefined name `knn`, which raises NameError).
knn_5_cv = cross_val_score(knn_5, X_train_resampled, y_train_resampled)
knn_5_cv

In [None]:
print("Baseline:         ", baseline_cv.mean())
print("Stopwords removed:", stop_words_removed_cv.mean())
print("KNN (5) :         ", knn_5_cv.mean())

In [None]:
# Transform only: the test set must use the vocabulary and IDF weights fitted on the
# training data, otherwise its columns do not match the features knn_5 was trained on.
X_test_vectorized = tfidf.transform(X_test['text'])
knn_5_preds = knn_5.predict(X_test_vectorized)

print(classification_report(y_test, knn_5_preds))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbour classifier trained on the SMOTE-resampled training features.
knn_3 = KNeighborsClassifier(n_neighbors=3)
knn_3.fit(X_train_resampled, y_train_resampled)
# Cross-validate this estimator on the resampled training data
# (the original passed an undefined name `knn`, which raises NameError).
knn_3_cv = cross_val_score(knn_3, X_train_resampled, y_train_resampled)
knn_3_cv

In [None]:
# Mean cross-validation score of every model fitted so far.
print("Baseline:         ", baseline_cv.mean())
print("Stopwords removed:", stop_words_removed_cv.mean())
print("KNN (5):          ", knn_5_cv.mean())
# The 3-neighbour scores live in knn_3_cv; `knn_10_cv` was never defined.
print("KNN (3):          ", knn_3_cv.mean())

In [None]:
# Transform only: the test set must use the vocabulary and IDF weights fitted on the
# training data, otherwise its columns do not match the features knn_3 was trained on.
X_test_vectorized = tfidf.transform(X_test['text'])
knn_3_preds = knn_3.predict(X_test_vectorized)

print(classification_report(y_test, knn_3_preds))

In [None]:
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions use ConfusionMatrixDisplay.from_estimator instead.
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
# Confusion matrix of the 3-neighbour KNN model on the vectorized test set.
plot_confusion_matrix(knn_3, X_test_vectorized, y_test, cmap=plt.cm.Reds)
plt.grid(False) # removes the annoying grid lines from plot
plt.show()



In [None]:
# Mean cross-validation score per model. `knn_cv` was never defined, so report the two
# KNN variants that were actually cross-validated.
print("Baseline:         ", baseline_cv.mean())
print("Stopwords removed:", stop_words_removed_cv.mean())
print("KNN (5):          ", knn_5_cv.mean())
print("KNN (3):          ", knn_3_cv.mean())

In [None]:
pattern = "([a-z]{2,})"
regex_tokenizer = RegexpTokenizer(pattern)

In [None]:
pos_neg['text_regex_tokenized'] = [regex_tokenizer.tokenize(text) for text in pos_neg['text']]

In [None]:
pos_neg.head()

In [None]:
pos_neg['text_regex_tokenized'] = [' '.join(text) for text in pos_neg['text_regex_tokenized']]

In [None]:
pos_neg.head()

In [None]:
X_train['text'] = [regex_tokenizer.tokenize(text) for text in X_train['text']]

In [None]:
X_train.head()

In [None]:
X_train["text"] = X_train['text'].apply(remove_stopwords)

In [None]:
X_train.head()

In [None]:
from nltk.stem import WordNetLemmatizer
lemmer = WordNetLemmatizer()

def lemmatize_text(text):
    """Return a list with each word of ``text`` passed through ``lemmer.lemmatize``.

    ``text`` is iterated element by element, so it is expected to be a
    sequence of word tokens rather than a single string.
    """
    return list(map(lemmer.lemmatize, text))

X_train['text'] = X_train['text'].apply(lemmatize_text)

In [None]:
X_train.head()

In [None]:
from nltk.stem import SnowballStemmer

In [None]:
snow_stemmer = SnowballStemmer(language="english")

def snow_stem_text(text):
    """Return a list with each word of ``text`` passed through ``snow_stemmer.stem``.

    ``text`` is iterated element by element, so it is expected to be a
    sequence of word tokens rather than a single string.
    """
    stemmed_words = []
    for word in text:
        stemmed_words.append(snow_stemmer.stem(word))
    return stemmed_words

X_train['text'] = X_train['text'].apply(snow_stem_text)


In [None]:
X_train.head()

In [None]:
X_train['text'] = [' '.join(text) for text in X_train['text']]

In [None]:
X_train.head()