In [None]:
# IMPORTS

# general 
import pandas as pd
import numpy as np
import re

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# gensim
import gensim
from gensim.utils import simple_preprocess

# NLTK
import nltk
from nltk.corpus import stopwords

from collections import Counter
from wordcloud import WordCloud

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

%matplotlib inline


In [None]:
# IMPORT TWEETS
# Only the author and tweet-text columns are used below, so parse just those.
df = pd.read_csv("path/to/Tweet_update.csv", sep=',', usecols=['Authors', 'Tweet'])
# Fix the column order and take an explicit copy, so the column assignments in
# later cells operate on an independent frame rather than on a slice.
df = df[['Authors', 'Tweet']].copy()


In [None]:
# Sanity check: (rows, columns) of the frame right after loading
df.shape

In [None]:
# Coerce every tweet to str so the string operations below never meet a non-string
# value (a missing value becomes the literal text 'nan').
df['Tweet'] = df['Tweet'].map(str)

In [None]:
# Keep only the first occurrence of each distinct tweet text
# (repeats are possible when a tweet is an answer to another tweet).
df = df.drop_duplicates(subset='Tweet', keep='first')
df.shape

In [None]:
# Number of whitespace-separated words in each tweet (feeds the total / mean word counts)
count = df['Tweet'].str.split().str.len()
# Relabel each entry with its row label followed by ' words:' and order by that (string) label
count.index = count.index.astype(str) + ' words:'
count = count.sort_index()

In [None]:
print('Total number of words:', count.sum(), 'words')
print('Mean number of words per tweet:', round(count.mean(), 2), 'words')

# Temporary column holding the character count of each tweet
df['tweet_length'] = df['Tweet'].str.len()
print('Total length of dataset is:', df['tweet_length'].sum(), 'characters')

print('Mean Length of a tweet is:', round(df['tweet_length'].mean(), 0), 'characters')
# The column was only needed for the two figures printed above
df = df.drop(columns=['tweet_length'])

In [None]:
# Removing Twitter Handles (@user)

def remove_users(tweet, pattern1, pattern2):
    """Delete every match of ``pattern1`` and then of ``pattern2`` from ``tweet``.

    Parameters
    ----------
    tweet : str
        Text to clean.
    pattern1, pattern2 : str
        Regular expressions, applied in that order (the notebook passes the
        "@ user" form first and the "@user" form second).

    Returns
    -------
    str
        ``tweet`` with all matches removed; surrounding whitespace is left as is.
    """
    # Substitute the patterns themselves. Re-using each matched string as a new
    # pattern would also delete it where it is only the prefix of a longer
    # handle (e.g. removing '@ab' would turn '@abc' into 'c').
    tweet = re.sub(pattern1, '', tweet)
    return re.sub(pattern2, '', tweet)

In [None]:
# Strip "@ user" mentions first and "@user" mentions second. Raw strings keep \w a
# regex escape instead of an invalid Python string escape, and Series.apply (unlike
# np.vectorize) also works on an empty column.
df['tidy_tweet'] = df['Tweet'].apply(remove_users, args=(r'@ [\w]*', r'@[\w]*'))

In [None]:
# Lowercase the tidied text, so everything extracted from it in later cells is lowercase too
df['tidy_tweet'] = df['tidy_tweet'].str.lower()

In [None]:
# Hashtag
# Collect, per tweet, the word that follows each '#' (the '#' itself is not kept)
df['hashtags'] = df['tidy_tweet'].str.findall(r"#(\w+)")

In [None]:
# Tally every hashtag occurrence across all tweets. Iterating the per-tweet lists
# directly avoids the repeated list concatenation done by Series.sum() and also
# works when the frame has no rows.
d = Counter(tag for tags in df['hashtags'] for tag in tags)
# One row per hashtag (as index) with its count in a single column
df_hashtags = pd.DataFrame([d]).T

In [None]:
# Name the single count column, then display the total number of hashtag occurrences
df_hashtags.columns = ['freq']
df_hashtags['freq'].sum()

In [None]:
# Order hashtags from most to least frequent, then discard the first (most frequent) row.
# NOTE(review): presumably the top hashtag is the one the tweets were collected by — confirm.
# Re-running this cell discards one more row each time.
df_hashtags = df_hashtags.sort_values(by='freq', ascending=False).iloc[1:]

In [None]:
# Visualization of Top Hashtags
# Labels and counts of the first 20 rows of df_hashtags (sorted by frequency in an earlier cell)
labels = df_hashtags.head(20).index.values.tolist()
freq = df_hashtags['freq'].head(20).values.tolist()
index = np.arange(len(freq))  # one bar position per plotted hashtag
# NOTE(review): the message below is an unfilled template (and lacks a closing parenthesis)
# — TODO substitute the real tweet and hashtag counts.
print('Among (insert number of tweet), (insert number of hashtags were used.')
plt.figure(figsize=(12, 9))
plt.bar(index, freq, alpha = 0.8, color = 'black')
plt.xlabel('Hashtags', fontsize = 13)
plt.ylabel('Frequency', fontsize = 13)
# Put the hashtag text under each bar, rotated so long tags do not overlap
plt.xticks(index, labels, fontsize = 11, rotation = 90, fontweight = 'bold')
plt.title('Top 20 Hashtags of dataset', fontsize = 12, fontweight = 'bold')
plt.show()


In [None]:
# Remove Hashtag column (it was only needed for the frequency table and chart)
df = df.drop(columns=['hashtags'])

In [None]:
# Remove Hashtags from tweets
def remove_hashtags(tweet, pattern1, pattern2):
    """Delete every match of ``pattern1`` and then of ``pattern2`` from ``tweet``.

    Parameters
    ----------
    tweet : str
        Text to clean.
    pattern1, pattern2 : str
        Regular expressions, applied in that order (the notebook passes the
        "# tag" form first and the "#tag" form second).

    Returns
    -------
    str
        ``tweet`` with all matches removed; surrounding whitespace is left as is.
    """
    # Substitute the patterns themselves. Re-using each matched string as a new
    # pattern would also delete it where it is only the prefix of a longer
    # hashtag (e.g. removing '#ai' would turn '#aiops' into 'ops').
    tweet = re.sub(pattern1, '', tweet)
    return re.sub(pattern2, '', tweet)

In [None]:
# Strip "# tag" forms first and "#tag" forms second. Raw strings keep \w a regex
# escape instead of an invalid Python string escape, and Series.apply (unlike
# np.vectorize) also works on an empty column.
df['tidy_tweet'] = df['tidy_tweet'].apply(remove_hashtags, args=(r'# [\w]*', r'#[\w]*'))

In [None]:
# Remove Links
def remove_links(tweet):
    """Return ``tweet`` with each 'http' plus the non-whitespace characters that follow it deleted.

    A bare 'http' with nothing attached is left alone, because the pattern needs
    at least one following non-whitespace character.
    """
    return re.sub(r'http\S+', '', tweet)

In [None]:
# Series.apply runs remove_links on each tweet without np.vectorize, which cannot
# handle an empty column and first builds a fixed-width unicode array.
df['tidy_tweet'] = df['tidy_tweet'].apply(remove_links)

In [None]:
# Removing Punctuations, Numbers, and Special Characters
# Every character that is not an ASCII letter or '#' becomes a space.
# regex=True is required: newer pandas treats the pattern as literal text by default,
# in which case nothing would be replaced.
df['tidy_tweet'] = df['tidy_tweet'].str.replace('[^a-zA-Z#]', ' ', regex=True)

In [None]:
# Remove short words: only words longer than three characters are kept.
# Side effect: 'not' (three letters) is dropped too, so negation is lost. The original
# author flagged this as worth reconsidering for short tweet texts but kept the removal.
df['tidy_tweet'] = df['tidy_tweet'].map(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

In [None]:
# Tokenize words and clean-up Punctuations
def tokenize(tweet):
    """Lazily yield one token list per item of ``tweet``.

    Despite the singular name, ``tweet`` is iterated, so it is expected to be a
    collection of texts (the notebook passes the whole 'tidy_tweet' column).
    Each item is converted with ``str()`` and handed to gensim's
    ``simple_preprocess``; ``deacc=True`` asks it to strip accent marks from tokens.
    """
    for text in tweet:
        tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
        yield tokens

In [None]:
# Materialise the tokenize() generator: one list of tokens per row of 'tidy_tweet'
df['tidy_tweet_tokens'] = list(tokenize(df['tidy_tweet']))

In [None]:
# Remove Stopwords and custom stopwords
# prepare stopwords
try:
    stop_words = stopwords.words('english')
except LookupError:
    # The NLTK stop-word corpus is not installed yet (e.g. a fresh environment):
    # download it once and retry, so the notebook runs from a clean kernel.
    nltk.download('stopwords')
    stop_words = stopwords.words('english')
# Extra tokens to discard on top of the standard English list
stop_words.extend(['from', 'https', 'twitter', 'pic', 'twitt'])

In [None]:
def remove_stopwords(tweets):
    """Return, for each item of ``tweets``, its tokens that are not stop words.

    Each item is converted with ``str()`` and re-tokenised by ``simple_preprocess``
    (so both plain strings and token lists are accepted); tokens found in the
    notebook-level ``stop_words`` collection are then left out.

    Returns
    -------
    list of list of str
        One token list per input item, in input order.
    """
    # Build the lookup set once per call: membership tests against the list
    # would scan it for every single token.
    excluded = set(stop_words)
    return [
        [word for word in simple_preprocess(str(tweet)) if word not in excluded]
        for tweet in tweets
    ]

In [None]:
# Per-row token lists with the stop words filtered out
df['tokens_no_stop'] = remove_stopwords(df['tidy_tweet_tokens'])

In [None]:
# Drop rows having less than 3 tokens
# 'length' is a helper column holding the number of remaining tokens per row
df['length'] = df['tokens_no_stop'].map(len)
df = df.drop(index=df.loc[df['length'] < 3].index)

In [None]:
# The helper column has served its purpose; show the frame size after filtering
df = df.drop(columns=['length'])
df.shape

In [None]:
# Renumber the remaining rows from 0 and discard the old row labels
df = df.reset_index(drop=True)

In [None]:
# WordCloud

# join the tweet back together
def rejoin_words(row):
    """Return the tokens stored under ``row['tokens_no_stop']`` as one space-separated string."""
    return ' '.join(row['tokens_no_stop'])

In [None]:
# Space-join each row's remaining tokens back into a single string (row-wise apply)
df['no_stop_joined'] = df.apply(rejoin_words, axis = 1)

In [None]:
# One long string containing every cleaned tweet, separated by single spaces
all_words = ' '.join(df['no_stop_joined'])

In [None]:
# Build the word cloud from the concatenated cleaned tweets (at most 200 words,
# fixed random_state) and show it without axes.
wordcloud = WordCloud(
    width=900,
    height=600,
    random_state=21,
    max_font_size=110,
    background_color='ghostwhite',
    max_words=200,
    colormap='Dark2',
).generate(all_words)

plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Cleaning effects - Length of tweets
# Character counts before (raw 'Tweet') and after (rejoined cleaned tokens) preprocessing.
# NOTE(review): both columns stay on df and are therefore written out by the final
# to_pickle cell unless dropped elsewhere — confirm that is intended.
df['tweet_length'] = df['Tweet'].str.len()
df['cleaned_tweet_length'] = df['no_stop_joined'].str.len()
df_lengths = df[['tweet_length', 'cleaned_tweet_length']]

In [None]:
# Side-by-side distributions of tweet length in characters: original (left) vs cleaned (right)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider histplot/displot.
x1 = df_lengths.tweet_length
x2 = df_lengths.cleaned_tweet_length
plt.figure(figsize=(15, 6))
plt.suptitle('Length of tweet as number of characters', fontsize = 14, fontweight = 'bold')
# Chart 1: original tweets
plt.subplot(1, 2, 1)
sns.distplot(x1, color='black', label='No. characters', bins=35, hist_kws={'alpha':0.5, 'rwidth':0.8})
plt.title('Original Tweets', fontsize=12, fontweight='bold')
plt.xlabel('Number of characters', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xlim([0, 400])  # same x-range on both panels so they are directly comparable

# Chart 2: cleaned tweets
plt.subplot(1, 2, 2)
sns.distplot(x2, color='black', label='No. characters', bins=17, hist_kws={'alpha':0.5, 'rwidth':0.8})
plt.title('Cleaned Tweets', fontsize=12, fontweight='bold')
plt.xlabel('Number of characters', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xlim([0, 400])

plt.show()

In [None]:
# Word counts (whitespace-separated) before and after preprocessing.
# df_lengths is rebound here, replacing the character-length table from the earlier cell.
df['tweet_words'] = df['Tweet'].str.split().str.len()
df['cleaned_tweet_words'] = df['no_stop_joined'].str.split().str.len()
df_lengths = df[['tweet_words', 'cleaned_tweet_words']]

In [None]:
# Side-by-side distributions of tweet length in words: original (left) vs cleaned (right)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider histplot/displot.
x_1 = df_lengths.tweet_words
x_2 = df_lengths.cleaned_tweet_words
plt.figure(figsize = (15, 6))
plt.suptitle('Length of tweet as number of words.', fontsize=14, fontweight='bold')
# Chart 1: original tweets
plt.subplot(1, 2, 1)

sns.distplot(x_1, color='black', label='No. Words', bins=25, hist_kws={'alpha':0.5, 'rwidth':0.8})
plt.title('Original Tweets', fontsize=12, fontweight='bold')
plt.xlabel('Number of words', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xlim([0, 70])  # same x-range on both panels so they are directly comparable

# Chart 2: cleaned tweets
plt.subplot(1, 2, 2)
sns.distplot(x_2, color='black', label='No. Words', bins=15, hist_kws={'alpha':0.5, 'rwidth':0.8})
plt.title('Cleaned Tweets', fontsize=12, fontweight='bold')
plt.xlabel('Number of words', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xlim([0, 70])

plt.show()

In [None]:
# Top 25 most frequent words
# Flatten the cleaned tweets into one list of words and count each distinct word
# (value_counts orders the result from most to least frequent). A plain list,
# unlike np.concatenate, also works when no tweets are left.
word_freq = pd.Series(
    [word for text in df['no_stop_joined'] for word in text.split()],
    dtype=object,
).value_counts()

In [None]:
# Two-column table in the order of word_freq: occurrence count ('freq') and the
# word itself ('word'), with a fresh 0..n-1 row index.
word_df = pd.DataFrame({'freq': word_freq.values, 'word': list(word_freq.index)})

In [None]:
# Bar chart of the first 25 rows of word_df.
# Note: 'freq' and 'index' are rebound here; the same names were used for the hashtag chart.
label = word_df['word'].head(25)
freq = word_df['freq'].head(25)
index = np.arange(len(freq))  # one bar position per plotted word

print('Unique words:', len(word_df))
plt.figure(figsize=(12, 9))
plt.bar(index, freq, alpha=0.8, color='black')
plt.xlabel('Words', fontsize=13)
plt.ylabel('Frequency', fontsize=13)
# Put the word under each bar, rotated so long words do not overlap
plt.xticks(index, label, fontsize=11, rotation=90, fontweight='bold')
plt.title('Top 25 Words after preprocessing', fontsize=12, fontweight='bold')
plt.show()

In [None]:
# save df
# Persist the processed frame, with every column currently on it, as a pickle file
df.to_pickle('path/to/pre-processed.pkl')