In [37]:
# Data Handling
import pandas as pd
import datetime
import re
import numpy as np
import time

# Graphs
import matplotlib.pyplot as plt

# NLP
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.tokenize import TweetTokenizer
import nltk
from num2words import num2words
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
#Downloads
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Marc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [39]:
start = time.process_time()

In [40]:

def load_data(team, number):
    df = pd.read_excel('C:/Users/Marc/Dropbox/06_ESCP/01_Uni/06_MA Thesis/04_Code/02_Output/02_Tweets/'+ team + '/02_Tweets/01_Final/' + team + '_final_tweets.xlsx', nrows = number)
    return df

In [41]:
# List of contractions to replace cleaing the data
contractions = {
"ain't": "am not / are not",
"pt": "point",
"fgs" : "fieldgoals",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
} # List of contracti

In [42]:
def cleaning_tweets_tb(df_short):
    # Clearing Text (getting rid of RT = Retweets, punctuation and more)
    df_try = pd.DataFrame(df_short.full_text)
    df_try['before'] = df_try.full_text

    # Create functions that clean the text (right order)
    add_space_after_period = lambda x : re.sub("\.", ". ", x)
    harmonize_apo = lambda x : re.sub("’", "'", x)
    remove_links = lambda x : re.sub('(https?:\/\/[-\/\da-z\.\sA-Z0-9]+)', '', x) #remove link - always last in a tweet
    remove_rt = lambda x : re.sub('(RT\s@[\w]+:)'," ", x) # for  # for when it starts with a RT
    remove_at = lambda x: re.sub("(@[\w]+)", "", x) #Remove where there are @ someting
    remove_punctuation = lambda x: re.sub("[^\w]", ' ', x) # remove all other punctuation (that are not words) --> also emojis
    remove_numbers = lambda x: re.sub('[0-9]', '', x)
    remove_amp = lambda x: re.sub("(amp[\s]+)", "", x) #Remove where there are amp   strings
    remove_double_space = lambda x: re.sub('[\s]+', ' ', x)
    lower_case = lambda x: x.lower()

    # Add space after periods, to make sure words are seperate (... .Let's... -> ... .Let us)
    df_try['full_text'] = df_try.full_text.map(add_space_after_period).map(harmonize_apo)

    # expand contracted words let's -> let us
    for tweet_index in range(len(df_try)):
        for word in df_try.full_text.iloc[tweet_index].split():
            if word.lower() in contractions:
                df_try.full_text.iloc[tweet_index] = df_try.full_text.iloc[tweet_index].replace(word, contractions[word.lower()])

    # Apply the removal from above
    df_try['full_text'] = df_try.full_text.map(remove_links).map(remove_rt).map(remove_at).map(remove_punctuation).map(remove_double_space).map(remove_numbers).map(remove_amp).apply(lower_case)

    # rename columns
    df_try = df_try.rename(columns = {'full_text': 'cleaned'})

    return df_try

In [43]:
def preprocess_data(clean_data):
     #classify as string
     data = clean_data['cleaned'].astype(str)

    # iniate tokenizer and lemmatizer
     lemmatizer = nltk.stem.WordNetLemmatizer()
     w_tokenizer =  TweetTokenizer()

    # Function to lemmatize and tokenize at the same time
     def lemmatize_text(text):
        return [(lemmatizer.lemmatize(w)) for w in w_tokenizer.tokenize((text))]

    # Function to remove punctuation and create list of words
     def remove_empty_spaces(words):
        new_words = []
        for word in words:
          new_word = word
          if new_word != '':
             new_words.append(new_word)
        return new_words

     words = data.apply(lemmatize_text)
     words = words.apply(remove_empty_spaces)


     return pd.DataFrame(words)

In [44]:
def remove_stopwords(df_try):
    # Remove Stopwords and other unwanted words
    stop_words = set(stopwords.words('english'))
    df_try['token_lemma'] = df_try['token_lemma'].apply(lambda x: [item for item in x if item not in stop_words])


    # Create list to remove words associated to city, players and teams names
    nba_teams = ["token_lemma", "u", "buck","v","boston","celtic","warrior", "staple", "nba", "nbapacific", "jaylen", "brown","knicks","miami","ny","heat","losangeleslakers", "knick", "lakersofficial", "losangeles", "lalakers", "b", "angeles", "los", "lakersbasketball","la", "lakers", "Atlanta Hawks", "hawk","celtic","Boston Celtics","net","Brooklyn Nets","hornet","Charlotte Hornets","bulls","Chicago Bulls","cavalier","Cleveland Cavaliers","maverick","Dallas Mavericks","nugget","Denver Nuggets","piston","Detroit Pistons","warrior","Golden State Warriors","rocket","Houston Rockets","pacer","Indiana Pacers","clippers","LA Clippers","LA Lakers","grizzlie", "Memphis Grizzlies","heat","Miami Heat","Milwaukee Bucks", "bucks","timberwolve","Minnesota Timberwolves","pelican","New Orleans Pelicans","kinck","New York Knicks","Oklahoma City Thunder", "thunder","Orlando Magic","Philadelphia Sixers", "sixer","Phoenix Suns", "sun","Portland Trail Blazers", "trail", "blazer","Sacramento Kings", "king","NaN","San Antonio Spurs", "spur","Toronto Raptors", "raptor","Utah Jazz", "jazz","Washington Wizards", "wizard"]
    [x.lower() for x in nba_teams]


    df_try['token_lemma'] = df_try['token_lemma'].apply(lambda x: [item for item in x if item not in nba_teams])


    return df_try

In [45]:
# Load Data
team = 'CHB'
data = load_data(team, 224086)

In [46]:
data.full_text = data.full_text.astype(str)

In [47]:
#Clean Data
cleaned_data = cleaning_tweets_tb(data)

In [48]:
# Run function
preprocessed_tweets = preprocess_data(cleaned_data)

# Append tokenized and lemmatized words back
cleaned_data['token_lemma'] = preprocessed_tweets

# Remove stopwords and other unwanted names such as teams or players
final_df = pd.concat([remove_stopwords(cleaned_data),data['created_at']], axis = 1)

In [49]:
final_df

Unnamed: 0,cleaned,before,token_lemma,created_at
0,kawhi chili leonard nba leader ppg steph cur...,Kawhi Chili' Leonard NBA Leader: 26.0 PPG Step...,"[kawhi, chili, leonard, leader, ppg, steph, cu...",2021-04-13 17:04:40
1,two years ago i started tweeting about chicag...,RT @UKChicagoBulls: Two years ago I started tw...,"[two, year, ago, started, tweeting, chicago, b...",2021-04-13 17:02:36
2,since the trade deadline changed the bulls ros...,Since the trade deadline changed the Bulls ros...,"[since, trade, deadline, changed, bull, roster...",2021-04-13 17:00:26
3,chicago bulls forward thad young said on the ...,RT @ChiSportUpdates: Chicago Bulls forward Tha...,"[chicago, bull, forward, thad, young, said, bu...",2021-04-13 16:59:01
4,the most unbelievable thing about the harry p...,RT @HeathWParker: The most unbelievable thing ...,"[unbelievable, thing, harry, potter, series, t...",2021-04-13 16:44:13
...,...,...,...,...
224081,mark my words he about to be an all star and ...,@Lakers Mark my words he about to be an all-st...,"[mark, word, star, called]",2021-05-12 15:16:10
224082,new episode a look at the race in the west ...,New Episode - A look at the 5-6-7 race in the ...,"[new, episode, look, race, west, try, hold, ca...",2021-05-12 15:16:09
224083,lakers survive with no lebron no true pg agai...,"RT @Sedano: Lakers survive with no LeBron, no ...","[survive, lebron, true, pg, best, league, th, ...",2021-05-12 15:16:09
224084,and a shout out to the taylor swift sub secti...,RT @xKENNANx: and a shout out to the Taylor Sw...,"[shout, taylor, swift, sub, section, twitter, ...",2021-05-12 15:16:05


In [50]:
# Worldcloud
def create_wordcloud(final_df):
    from wordcloud import WordCloud

    #WordCloud
    wc = WordCloud(background_color="white", max_words=3000,repeat=True)
    wc.generate(str(final_df['token_lemma']))
    plt.figure(figsize=(12,10))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()


# TF-idf

# For tweets there are not title, hence no difference weights assigned

def create_df(final_df):
    DF = {}

    for i in range(len(final_df)):

        tokens = final_df['token_lemma'][i]

        for w in tokens:
            try:
                DF[w].add(i)
            except:
                DF[w] = {i}

    for i in DF:
        DF[i] = len(DF[i])

    return DF

#helper function for calc tf_idf
def return_word_freq(word):
    f = 0
    try:
        f = DF[word]
    except:
        pass
    return f

#Calculating tf-idf
#tf = Term frequencyy --> depends on the tweet
#idf = takes whole corpus into account (all tweets)

def calc_tf_idf(final_df):
    tf_idf = {}

    for i in range(len(final_df)):

        tokens = final_df['token_lemma'][i]
        counter = Counter(tokens)
        words_count = len(final_df['token_lemma'][i])

        for token in np.unique(tokens):

            tf = counter[token]/words_count
            df = return_word_freq(token)
            idf = np.log((len(final_df)+1)/(df+1)) #--> by adding one in the nummerator no negative values are possible

            tf_idf[token] = df*idf

    return tf_idf

DF = create_df(final_df)
tf_idf = calc_tf_idf(final_df)

#Vectorising
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tf_idf)

Not done, the transform might work, the tokenization and everything up to the td-idf scores works. However the cosine similiratity is a bit tricky to be calculated.
The problem espeically starts with comparing a new corpus' similarity to an exisitng

# TextBlob and Vader
Vader: https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f

In [51]:
# Add polarity, neg, pos... to the tweets

final_df[['TB_polarity', 'TB_subjectivity']] = final_df['cleaned'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

for index, row in final_df['cleaned'].iteritems():
    score = SentimentIntensityAnalyzer().polarity_scores(row)
    neg = score['neg']
    neu = score['neu']
    pos = score['pos']
    comp = score['compound']
    final_df.loc[index, 'Va_neg'] = neg
    final_df.loc[index, 'Va_neu'] = neu
    final_df.loc[index, 'Va_pos'] = pos
    final_df.loc[index, 'Va_compound'] = comp
    if neg > pos:
        final_df.loc[index, 'Va_sentiment'] = 'negative'
    elif pos > neg:
        final_df.loc[index, 'Va_sentiment'] = 'positive'
    else:
        final_df.loc[index, 'Va_sentiment'] = 'neutral'


In [52]:
# Use not cleanded dataset (cleaning now done by TB and Vader (Vader can read Emojis))
final_df[['_before_TB_polarity', '_before_TB_subjectivity']] = final_df['before'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

for index, row in final_df['before'].iteritems():
    score = SentimentIntensityAnalyzer().polarity_scores(row)
    neg = score['neg']
    neu = score['neu']
    pos = score['pos']
    comp = score['compound']
    final_df.loc[index, '_before_Va_neg'] = neg
    final_df.loc[index, '_before_Va_neu'] = neu
    final_df.loc[index, '_before_Va_pos'] = pos
    final_df.loc[index, '_before_Va_compound'] = comp
    if neg > pos:
        final_df.loc[index, '_before_Va_sentiment'] = 'negative'
    elif pos > neg:
        final_df.loc[index, '_before_Va_sentiment'] = 'positive'
    else:
        final_df.loc[index, '_before_Va_sentiment'] = 'neutral'

In [53]:
final_df.to_csv('C:/Users/Marc/Dropbox/06_ESCP/01_Uni/06_MA Thesis/04_Code/02_Output/02_Tweets/'+ team +'/02_Tweets/01_Final/'+ team +'_sentiment_analized.csv')

In [54]:
print(time.process_time() - start)


6042.25


Instruction:
- Change team name in 252 and run
- Check if that is the correct folder name
    - Done for LAL
    - GSW
    - BOC
    - MIH
    - CHB

