# Part 1: Data import and cleaning

In [8]:
#import initial libraries

import pandas as pd
import numpy as np

In [9]:
#import data, retweets already removed using Twarc

df = pd.read_csv("data/nike_NR.csv")

In [10]:
# look at data
df.info()
#130531 Tweets 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130531 entries, 0 to 130530
Data columns (total 37 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   id                            130531 non-null  int64  
 1   tweet_url                     130531 non-null  object 
 2   created_at                    130531 non-null  object 
 3   parsed_created_at             130531 non-null  object 
 4   user_screen_name              130531 non-null  object 
 5   text                          130531 non-null  object 
 6   tweet_type                    130531 non-null  object 
 7   coordinates                   350 non-null     object 
 8   hashtags                      19650 non-null   object 
 9   media                         31251 non-null   object 
 10  urls                          37673 non-null   object 
 11  favorite_count                130531 non-null  int64  
 12  in_reply_to_screen_name       55529 non-null

In [11]:
# count number of languages in data set

count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

53 ['en' 'ja' 'und' 'es' 'fr' 'pt' 'pl' 'tl' 'hi' 'ht' 'de' 'in' 'nl' 'tr'
 'ko' 'et' 'ar' 'no' 'fi' 'it' 'is' 'ca' 'sl' 'sv' 'da' 'th' 'fa' 'ru'
 'zh' 'ro' 'cs' 'eu' 'cy' 'hu' 'lv' 'el' 'vi' 'uk' 'lt' 'iw' 'ta' 'sr'
 'mr' 'bg' 'lo' 'hy' 'pa' 'ne' 'am' 'ur' 'ml' 'si' 'te']


In [12]:
# tweets are in 53 different languages

# I'll be working only with Tweets in English
# drop tweets in all other languages
# now working with 58830 Tweets 

df = df[df.lang == 'en']
df.shape

(58830, 37)

In [13]:
# drop unnecessary columns 

df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang', 'user_location', 'user_time_zone', 'user_urls', 'place'], axis=1)

In [16]:
# check start time & date of data

df.iloc[0]

# first Tweet 2021-01-13 18:46:48+00:00

id                                                  1349427695499816962
parsed_created_at                             2021-01-13 18:46:48+00:00
user_screen_name                                              DRUGZK1LL
text                  Well sounds to me like ur bf is sick and fucki...
tweet_type                                                        reply
hashtags                                                            NaN
favorite_count                                                        0
possibly_sensitive                                                  NaN
retweet_count                                                         0
user_id                                             1303894143492673537
Name: 0, dtype: object

In [18]:
# check end time & date of data 

df.iloc[-1]

# last Tweet 2021-01-06 23:38:56+00:00

id                                                  1346964499438448642
parsed_created_at                             2021-01-06 23:38:56+00:00
user_screen_name                                            NikeService
text                  @CJ2Blessed Definitely not something we want t...
tweet_type                                                        reply
hashtags                                                            NaN
favorite_count                                                        0
possibly_sensitive                                                False
retweet_count                                                         0
user_id                                                        14462333
Name: 130527, dtype: object

# Part 2: Text processing for NLP 

In [19]:
# create variable for "text" column 
text = df['text'] 

In [20]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers

# import necessary libraries: ntlk etc.

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 


stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [21]:
# apply text processing functions to text

df['processed_text'] = df['text'].apply(process_text)

In [22]:
# look at some of processed text

pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  This is separate from the ipykernel package so we can avoid doing imports until


0     [well, sounds, like, ur, bf, sick, fucking, tired, ur, lulu, leggings, birkenstocks, feet, ass, nike, pro, sports, bra, wearing, ass, like, girl, hes, bored, babe, hes, bored]                                                                    
1     [@redbloodedguy, nike, use, slave, labor, china, different, use, slaves, us, early, th, century]                                                                                                                                                   
3     [sneaker, news, alert, nike, waffle, one, set, arrive, sleek, black, #upcomingsneakers, #nikewaffleone, #soledout, #trainers, #sneakers]                                                                                                           
5     [@gosvenerwilliam, @anacabrera, @donwinslow, @nike, fed, lies, antifa, organization, per, se, know, acronym, antifa, means, fascism, know, corrupt, leaders, like, hitler, mussolini, fascists, father, fought, ww, antifa, trump, nature, fascist]


In [23]:
# part-of-speech tagging 

ready_for_pos = df['processed_text']

def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [24]:
# lemmatizing

pos_tagged = df['pos_tagged']

wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [25]:
# look at lemmatized text

df['lemmatized'] = lemmatized
lemmatized[:20]

[['well',
  'sound',
  'like',
  'ur',
  'bf',
  'sick',
  'fucking',
  'tired',
  'ur',
  'lulu',
  'legging',
  'birkenstocks',
  'foot',
  'as',
  'nike',
  'pro',
  'sport',
  'bra',
  'wearing',
  'as',
  'like',
  'girl',
  'he',
  'bored',
  'babe',
  'he',
  'bored'],
 ['@redbloodedguy',
  'nike',
  'use',
  'slave',
  'labor',
  'china',
  'different',
  'use',
  'slave',
  'u',
  'early',
  'th',
  'century'],
 ['sneaker',
  'news',
  'alert',
  'nike',
  'waffle',
  'one',
  'set',
  'arrive',
  'sleek',
  'black',
  '#upcomingsneakers',
  '#nikewaffleone',
  '#soledout',
  '#trainers',
  '#sneakers'],
 ['@gosvenerwilliam',
  '@anacabrera',
  '@donwinslow',
  '@nike',
  'fed',
  'lie',
  'antifa',
  'organization',
  'per',
  'se',
  'know',
  'acronym',
  'antifa',
  'mean',
  'fascism',
  'know',
  'corrupt',
  'leader',
  'like',
  'hitler',
  'mussolini',
  'fascist',
  'father',
  'fought',
  'ww',
  'antifa',
  'trump',
  'nature',
  'fascist'],
 ['honestly', 'really',

In [26]:
# before vectorizing, cast lists of words back into strings

df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))
pd.set_option('display.max_colwidth', -1)
final_docs = df['final_docs']
final_docs[3000:3020]

  after removing the cwd from sys.path.


5605    indeed                                                                                                                                                                                                        
5606    everybody hate nike ...                                                                                                                                                                                       
5608    top seller week am ... volt purple use code nike extra                                                                                                                                                        
5610    ad restock nike air force wb wheat                                                                                                                                                                            
5614    @tontolet follow follow back let's mutuals                                                                                          

# Part 3: NMF and LDA topic modeling


In [27]:
#create document term matrix with TFIDF

#import vectorizing tool (usee TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer
# set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

In [28]:
#run NMF model 

#import NMF tool 
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=6)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)

In [29]:
# run LDA model

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components = 6, max_iter=10, learning_method='online', learning_decay=.9)
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [30]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 30)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 30)

LDA Model:
Topic 0:
[('like', 697.0645948971771), ('got', 430.7047713824012), ('adidas', 425.0214159858603), ('like nike', 348.2705917838978), ('shoe', 330.5008387656179), ('store', 301.42170096487337), ('know', 286.78483715355554), ('blazer', 269.2888279476083), ('right', 261.2167214317633), ('people', 256.8246661676234)]
Topic 1:
[('jordan', 427.23437900640465), ('check', 404.8758103796641), ('added', 390.32706636251146), ('closet', 374.52353336889274), ('poshmarkapp shopmycloset', 370.0107336110526), ('run', 362.87972732961197), ('closet poshmark', 361.22471160861795), ('check added', 361.22469155153453), ('added closet', 361.22468682693886), ('poshmark', 307.0994312034846)]
Topic 2:
[('new', 459.4530627843483), ('shoe', 307.4884984178751), ('need', 307.48577075841627), ('love', 261.59525579742626), ('tech', 254.88088745996964), ('think', 248.6773571937578), ('thank', 234.2630743436685), ('want', 223.5648709242389), ('nike tech', 220.33751616867065), ('balance', 214.0145261232407)]


In [31]:
# make results of LDA2 model a column in dataframe

topic_values = lda_model.transform(doc_term_matrix_1)
df['LDA1_topic'] = topic_values.argmax(axis=1)

In [39]:
df[df.LDA1_topic == 5][:20]

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,possibly_sensitive,retweet_count,user_id,processed_text,pos_tagged,lemmatized,final_docs,LDA1_topic
6,1349427641389084672,2021-01-13 18:46:35+00:00,jojobeans29,Honestly I really do fuck with Nike as much as one can fuck with a brand.,original,,0,,0,3404373975,"[honestly, really, fuck, nike, much, one, fuck, brand]","[(honestly, RB), (really, RB), (fuck, VB), (nike, RB), (much, JJ), (one, CD), (fuck, NN), (brand, NN)]","[honestly, really, fuck, nike, much, one, fuck, brand]",honestly really fuck nike much one fuck brand,5
32,1349427289952550916,2021-01-13 18:45:11+00:00,Gulian74,@Nike gotta disregard the results on the SNKRS app and rerelease the @KingJames Hardwood Classic 7s,original,,0,,0,52489336,"[@nike, gotta, disregard, results, snkrs, app, rerelease, @kingjames, hardwood, classic]","[(@nike, IN), (gotta, NNS), (disregard, JJ), (results, NNS), (snkrs, JJ), (app, JJ), (rerelease, NN), (@kingjames, NNS), (hardwood, VBD), (classic, JJ)]","[@nike, gotta, disregard, result, snkrs, app, rerelease, @kingjames, hardwood, classic]",@nike gotta disregard result snkrs app rerelease @kingjames hardwood classic,5
35,1349427272810459136,2021-01-13 18:45:07+00:00,GQMagazine,Snakeskin Supreme Nikes‚Äîneed we say more? https://t.co/p72omuZnQ6 https://t.co/BqscQomEcD,original,,2,False,0,21701757,"[snakeskin, supreme, nikes, need, say]","[(snakeskin, JJ), (supreme, NN), (nikes, NNS), (need, VBP), (say, VBP)]","[snakeskin, supreme, nike, need, say]",snakeskin supreme nike need say,5
49,1349427097098285056,2021-01-13 18:44:25+00:00,relsquared,Nike just produces such high quality.\nJustifies my purchases errrytine.,original,,0,,0,2314538184,"[nike, produces, high, quality, justifies, purchases, errrytine]","[(nike, JJ), (produces, VBZ), (high, JJ), (quality, NN), (justifies, NNS), (purchases, NNS), (errrytine, VBP)]","[nike, produce, high, quality, justifies, purchase, errrytine]",nike produce high quality justifies purchase errrytine,5
55,1349427015586369537,2021-01-13 18:44:06+00:00,more_sneakers,AD : Nike Air Force 1 'Rayguns'\nLATE DROP via Asphaltgold\n=&gt;https://t.co/2rBiv831gg\n=&gt;https://t.co/2rBiv831gg https://t.co/GFVOMPQ4TZ,original,,3,False,0,1411447658,"[ad, nike, air, force, rayguns, late, drop, via, asphaltgold]","[(ad, NN), (nike, IN), (air, NN), (force, NN), (rayguns, NN), (late, JJ), (drop, NN), (via, IN), (asphaltgold, NN)]","[ad, nike, air, force, rayguns, late, drop, via, asphaltgold]",ad nike air force rayguns late drop via asphaltgold,5
63,1349426968559710208,2021-01-13 18:43:55+00:00,SoleCollector,Your best look yet\n\nSEE MORE: https://t.co/LpG28zkZIs,original,,10,False,1,19382500,"[best, look, yet, see]","[(best, JJS), (look, NN), (yet, RB), (see, VB)]","[best, look, yet, see]",best look yet see,5
84,1349426731413889024,2021-01-13 18:42:58+00:00,williamstoutjr2,@KingSye30886074 @AnnCoulter Nike‚Äôs not good at supporting anyone. Just ask Zion Williamson. https://t.co/gH9amGx8hE,reply,,0,False,0,1223275293219213312,"[@kingsye, @anncoulter, nike, good, supporting, anyone, ask, zion, williamson]","[(@kingsye, JJ), (@anncoulter, NN), (nike, RB), (good, JJ), (supporting, VBG), (anyone, NN), (ask, JJ), (zion, NN), (williamson, NN)]","[@kingsye, @anncoulter, nike, good, supporting, anyone, ask, zion, williamson]",@kingsye @anncoulter nike good supporting anyone ask zion williamson,5
88,1349426667836612608,2021-01-13 18:42:43+00:00,mjostew,nike said just don‚Äôt ‚ô•Ô∏è https://t.co/27zWp44ntr,quote,,0,False,0,294844976,"[nike, said]","[(nike, NN), (said, VBD)]","[nike, said]",nike said,5
131,1349426166772486147,2021-01-13 18:40:43+00:00,outofmySHEL,Nike shouldn‚Äôt be supporting lawmakers. https://t.co/44WXImNyLf,quote,,0,False,0,119654973,"[nike, supporting, lawmakers]","[(nike, IN), (supporting, VBG), (lawmakers, NNS)]","[nike, supporting, lawmaker]",nike supporting lawmaker,5
156,1349425802035789827,2021-01-13 18:39:16+00:00,Alessaaandro99,@CFC_Cal Auba posted this on his main insta at the Nike event in February üëèüèΩ https://t.co/BgCnxpi5Oj,reply,,1,False,0,858653221,"[@cfc_cal, auba, posted, main, insta, nike, event, february]","[(@cfc_cal, JJ), (auba, NN), (posted, VBD), (main, JJ), (insta, NN), (nike, JJ), (event, NN), (february, NN)]","[@cfc_cal, auba, posted, main, insta, nike, event, february]",@cfc_cal auba posted main insta nike event february,5


In [40]:
# cross-validation to find best parameters for LDA model

#import cross-validation tool
from sklearn.model_selection import GridSearchCV

# define search parameters
search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# initialize model for cross-validation
lda = LatentDirichletAllocation()

# initialize grid search class 
model = GridSearchCV(lda, param_grid=search_params)

# run grid search 
model.fit(doc_term_matrix_1)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 8, 10, 12]})

In [41]:
# what is best model? 
best_lda_model = model.best_estimator_

# print parameters for best model
print("Parameters for best model: ", model.best_params_)

# print log likelihood score
print("Log likelihood score of best model: ", model.best_score_)

# print perplexity score

Parameters for best model:  {'learning_decay': 0.9, 'n_components': 5}
Log likelihood score of best model:  -227422.72278921158
