# Machine Learning for Sentiment Analysis, Tweets Data

In [None]:
import re
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
from tqdm import tqdm_notebook

In [None]:
# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Draw every figure below with the ggplot style sheet.
plt.style.use('ggplot')

## Load Tweets

In [None]:
# Read the training tweets from CSV; later cells use its 'tweet' and 'label' columns.
df_train = pd.read_csv('./datasets/train_tweets.csv')
# Preview the first rows (notebook display).
df_train.head()

In [None]:
# Bar chart of how many tweets carry each label value.
label_axes = sns.countplot(x=df_train['label'])
label_axes.set_title('Racist vs Not-racist Tweets')
# Persist the figure before showing it.
label_axes.figure.savefig('./images/plot-racist-vs-not-racist-tweets')
plt.show()

## Preprocess Tweets

In [None]:
def remove_pattern(text, pattern):
    """Return ``text`` with every match of the regex ``pattern`` removed.

    Parameters:
        text: the string to clean.
        pattern: a regular expression; each non-overlapping match is deleted.

    Returns:
        The cleaned string (unchanged when nothing matches).
    """
    # Substitute with the pattern itself. Re-using each matched substring as a
    # new regex misreads metacharacters in the match and lets a short match
    # (e.g. '@user') eat the prefix of a longer one (e.g. '@username').
    return re.sub(pattern, '', text)

#### 1) Removing twitter handles (@user)

In [None]:
# Regex for a twitter handle: '@' followed by any run of word characters.
# Raw string so that '\w' is not parsed as an (invalid) string escape sequence.
pattern = r'@[\w]*'

In [None]:
# Wrap remove_pattern so it can be called on the whole 'tweet' column at once.
strip_pattern = np.vectorize(remove_pattern)
df_train['cleaned_tweet'] = strip_pattern(df_train['tweet'], pattern)
df_train.head()

#### 2) Removing punctuation, numbers and special characters

In [None]:
# Replace every character that is not a letter or '#' with a space.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the tweets untouched.
df_train['cleaned_tweet'] = df_train['cleaned_tweet'].str.replace(r'[^a-zA-Z#]', ' ', regex=True)
df_train.head()

#### 3) Removing very short words

In [None]:
# Length threshold for the short-word filter: the filter keeps only words with
# len(word) > min_length, so words of 3 characters or fewer are dropped.
min_length = 3

In [None]:
# Rebuild each tweet from only those words that are longer than min_length.
df_train['cleaned_tweet'] = df_train['cleaned_tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > min_length)
)
df_train.head()

#### 4) Tokenizing cleaned tweets

In [None]:
# Split every cleaned tweet on whitespace into a list of tokens.
df_train['tokenized_tweet'] = df_train['cleaned_tweet'].apply(str.split)
df_train.head()

#### 4) Stemming tokenized tweets

In [None]:
from nltk.stem.porter import *

In [None]:
# Single shared Porter stemmer; it is also used inside preprocess_data() further down.
stemmer = PorterStemmer()

In [None]:
# Stem each token of each tweet, keeping one list of stems per tweet.
df_train['tokenized_stemmed_tweet'] = df_train['tokenized_tweet'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
df_train.head()

#### 5) Putting the preprocessed result into cleaned tweets

In [None]:
# Join the stemmed tokens back into one string per tweet.
# Done as a single column assignment: the per-row chained assignment
# df['col'][i] = ... may write to a temporary copy (it is a no-op under pandas
# copy-on-write) and also assumes the index is exactly 0..n-1.
df_train['cleaned_tweet'] = df_train['tokenized_stemmed_tweet'].apply(' '.join)

In [None]:
# Preview the first rows after preprocessing (notebook display).
df_train.head()

In [None]:
# Save the preprocessed training frame. No index argument is passed, so the
# DataFrame index is written as the first CSV column.
df_train.to_csv('./datasets/train_tweets_preprocessed.csv')

## Explore Cleaned Tweets

#### A) Understanding the common words used in the tweets: WordCloud

In [None]:
from wordcloud import WordCloud

In [None]:
# One long string containing every cleaned tweet, used as the word-cloud corpus.
all_words = ' '.join(df_train['cleaned_tweet'])
word_cloud = WordCloud(
    width=800,
    height=500,
    max_font_size=110,
    random_state=21,
).generate(all_words)

In [None]:
# Render the word cloud as an image without axes, save it, then display it.
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 7))
cloud_ax.imshow(word_cloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('The Most Common Words of Tweets')
cloud_fig.savefig('./images/word-cloud-the-most-common-words')
plt.show()

#### B) Understanding the common words in non-racist/sexist tweets

In [None]:
# Corpus built only from the cleaned tweets whose label is 0.
normal_words = ' '.join(df_train.loc[df_train['label'] == 0, 'cleaned_tweet'])
word_cloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(normal_words)

In [None]:
# Render the label-0 word cloud as an image without axes, save it, then display it.
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 7))
cloud_ax.imshow(word_cloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('The Most Common Non-Racist Words of Tweets')
cloud_fig.savefig('./images/word-cloud-the-most-common-non-racist-words')
plt.show()

#### C) Understanding the common words in racist/sexist tweets

In [None]:
# Corpus built only from the cleaned tweets whose label is 1.
racist_words = ' '.join(df_train.loc[df_train['label'] == 1, 'cleaned_tweet'])
word_cloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(racist_words)

In [None]:
# Render the label-1 word cloud as an image without axes, save it, then display it.
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 7))
cloud_ax.imshow(word_cloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('The Most Common Racist Words of Tweets')
cloud_fig.savefig('./images/word-cloud-the-most-common-racist-words')
plt.show()

#### D) Understanding the impact of hashtags on tweets sentiment

In [None]:
def extract_hashtags(x):
    """Collect the hashtags of each text in ``x``.

    Returns a list with one entry per text: the list of words that follow a
    '#' in that text (the '#' itself is not included), in order of appearance.
    """
    hashtag_re = re.compile(r'#(\w+)')
    return [hashtag_re.findall(tweet) for tweet in x]

In [None]:
# extracting hashtags from non-racist tweets (label == 0)
HT_positive = extract_hashtags(df_train['cleaned_tweet'][df_train['label'] == 0])
# flatten the per-tweet lists in one pass; sum(lists, []) re-copies the growing
# accumulator on every step and is quadratic in the number of hashtags
HT_positive = [tag for tags in HT_positive for tag in tags]
HT_freqdist = nltk.FreqDist(HT_positive)

df_HT = pd.DataFrame({'Hashtags': list(HT_freqdist.keys()),
                      'Count': list(HT_freqdist.values())})

df_HT = df_HT.nlargest(columns='Count', n=10) # select the top 10 most frequent hashtags
plt.figure(figsize=(16,5))
ax = sns.barplot(data=df_HT, x='Hashtags', y='Count')
ax.set(ylabel='Count')
plt.savefig('./images/plot-hastags-of-non-racist-words')
plt.show()

In [None]:
# extracting hashtags from racist tweets (label == 1)
HT_negative = extract_hashtags(df_train['cleaned_tweet'][df_train['label'] == 1])
# flatten the per-tweet lists in one pass; sum(lists, []) re-copies the growing
# accumulator on every step and is quadratic in the number of hashtags
HT_negative = [tag for tags in HT_negative for tag in tags]
HT_freqdist = nltk.FreqDist(HT_negative)

df_HT = pd.DataFrame({'Hashtags': list(HT_freqdist.keys()),
                      'Count': list(HT_freqdist.values())})

df_HT = df_HT.nlargest(columns='Count', n=10) # select the top 10 most frequent hashtags
plt.figure(figsize=(16,5))
ax = sns.barplot(data=df_HT, x='Hashtags', y='Count')
ax.set(ylabel='Count')
plt.savefig('./images/plot-hastags-of-racist-words')
plt.show()

## Extract Features of Cleaned Tweets

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#### A. Bag-of-Words features

In [None]:
# Bag-of-words counts: English stop words removed, terms must occur in at least
# 2 tweets and in at most 90% of them, vocabulary capped at 1000 terms.
bow_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    max_df=0.9,
    min_df=2,
)
# Learn the vocabulary from the training tweets and vectorize them in one step.
bow_features = bow_vectorizer.fit_transform(df_train['cleaned_tweet'])

#### B. TF-IDF features

In [None]:
# TF-IDF weights: English stop words removed, terms must occur in at least
# 2 tweets and in at most 90% of them, vocabulary capped at 1000 terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    max_df=0.9,
    min_df=2,
)
# Learn the vocabulary/IDF from the training tweets and vectorize them in one step.
tfidf_features = tfidf_vectorizer.fit_transform(df_train['cleaned_tweet'])

## Build Classifier Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

#### A) Building model using Bag-of-Words features

In [None]:
# preparing data from extracted features
x_train, x_test, y_train, y_test = train_test_split(bow_features, df_train['label'], test_size=0.3, random_state=10)

# training the model
lreg_model = LogisticRegression()
lreg_model.fit(x_train, y_train)

# testing the model 
prediction = lreg_model.predict_proba(x_test)
prediction_int = prediction[:,1] >= 0.5 # if prediction is greater than or equal to 0.5 than 1 else 0
prediction_int = prediction_int.astype(np.int)

In [None]:
# calculating f1 score metrics
model_score = f1_score(yvalid, prediction_int)
print(f'Model F1 score: {model_score:.6f}')

#### B) Building model using TF-IDF features

In [None]:
# preparing data from extracted features
x_train, x_test, y_train, y_test = train_test_split(tfidf_features, df_train['label'], test_size=0.3, random_state=10)

# training the model
lreg_model = LogisticRegression()
lreg_model.fit(x_train, y_train)

# testing the model 
prediction = lreg_model.predict_proba(x_test)
prediction_int = prediction[:,1] >= 0.5 # if prediction is greater than or equal to 0.5 than 1 else 0
prediction_int = prediction_int.astype(np.int)

In [None]:
# calculating f1 score metrics
model_score = f1_score(yvalid, prediction_int)
print(f'Model F1 score: {model_score:.6f}')

## Test The Model

In [None]:
# Read the test tweets from CSV; later cells use its 'tweet' and 'id' columns.
df_test = pd.read_csv('./datasets/test_tweets.csv')
# Preview the first rows (notebook display).
df_test.head()

In [None]:
def preprocess_data(df_data, pattern, min_length, save_as):
    """Apply the tweet preprocessing steps to ``df_data`` and save the result.

    Adds/overwrites the columns 'cleaned_tweet', 'tokenized_tweet' and
    'tokenized_stemmed_tweet' on ``df_data`` in place.

    Parameters:
        df_data: DataFrame with a 'tweet' column of strings.
        pattern: regex whose matches are removed from every tweet first.
        min_length: only words with more than this many characters are kept.
        save_as: name prefix of the CSV written under ./datasets/.

    Returns:
        The same ``df_data`` object, with the new columns.
    """
    # 1) remove every match of `pattern`
    df_data['cleaned_tweet'] = np.vectorize(remove_pattern)(df_data['tweet'], pattern)
    # 2) keep letters and '#' only; regex=True is required because pandas >= 2.0
    #    treats the pattern as a literal string by default
    df_data['cleaned_tweet'] = df_data['cleaned_tweet'].str.replace(r'[^a-zA-Z#]', ' ', regex=True)
    # 3) drop words that are not longer than min_length
    df_data['cleaned_tweet'] = df_data['cleaned_tweet'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > min_length]))
    # 4) tokenize and stem with the module-level Porter stemmer
    df_data['tokenized_tweet'] = df_data['cleaned_tweet'].apply(lambda x: x.split())
    df_data['tokenized_stemmed_tweet'] = df_data['tokenized_tweet'].apply(
        lambda x: [stemmer.stem(i) for i in x])
    # 5) join the stems back into one string per tweet; a whole-column assignment
    #    replaces the per-row chained assignment, which may write to a copy and
    #    assumes a 0..n-1 index
    df_data['cleaned_tweet'] = df_data['tokenized_stemmed_tweet'].apply(' '.join)

    df_data.to_csv(f'./datasets/{save_as}_tweets_preprocessed.csv')
    return df_data

In [None]:
# Run the test tweets through preprocess_data with the same handle pattern and
# minimum word length used for the training set; the file prefix is 'test'.
df_test = preprocess_data(df_test, pattern, min_length, 'test')

In [None]:
def extract_data(df_data, ftype='tfidf', vectorizer=None):
    """Vectorize ``df_data['cleaned_tweet']`` with an already fitted vectorizer.

    The features must be produced by the vectorizer that was fitted on the
    training tweets. Fitting a fresh vectorizer on new data builds a different
    vocabulary, so its columns would not line up with what the classifier was
    trained on.

    Parameters:
        df_data: object whose 'cleaned_tweet' entry holds the preprocessed texts.
        ftype: 'bow' or 'tfidf'; selects the notebook-level ``bow_vectorizer``
            or ``tfidf_vectorizer`` when ``vectorizer`` is not given.
        vectorizer: optional fitted object with a ``transform`` method that
            overrides the notebook-level vectorizers.

    Returns:
        Whatever ``vectorizer.transform`` returns for the texts.

    Raises:
        ValueError: if ``ftype`` is neither 'bow' nor 'tfidf'.
    """
    if ftype not in ('bow', 'tfidf'):
        # Previously an unknown ftype silently returned None.
        raise ValueError(f"ftype must be 'bow' or 'tfidf', got {ftype!r}")

    if vectorizer is None:
        # Fall back to the vectorizers fitted on the training tweets.
        vectorizer = bow_vectorizer if ftype == 'bow' else tfidf_vectorizer

    # transform() only: the vocabulary learnt during training is reused as is.
    return vectorizer.transform(df_data['cleaned_tweet'])

In [None]:
# Build TF-IDF features (the default ftype) for the preprocessed test tweets.
# The frame is named df_test; `df_text` was an undefined name.
test_features = extract_data(df_test)

In [None]:
# Score the test tweets with the most recently trained model.
prediction = lreg_model.predict_proba(test_features)
# column 1 is the probability of the model's second class (label 1, assuming
# the labels are 0/1); predict 1 when it is at least 0.5, otherwise 0
prediction_int = prediction[:, 1] >= 0.5
# the np.int alias was removed in NumPy 1.24; the builtin int is the replacement
prediction_int = prediction_int.astype(int)
df_test['label'] = prediction_int

In [None]:
# Keep only the tweet id and its predicted label, then write them to CSV
# without the DataFrame index.
report = df_test.loc[:, ['id', 'label']]
report.to_csv('./results/report_lreg_model_tfidf.csv', index=False)

## What's Next? Try Deep Learning!

---