# Part 1: Data cleaning 

In [1]:
#import initial libraries

import pandas as pd
import numpy as np


In [2]:
#import data 

df = pd.read_csv('data/jack.csv')

In [4]:
# look at basic info about data

df.info()
# this data set consists of 43467 Tweets
# Twarc search for Tweets to @Jack (Twitter CEO Jack Dorsey), in the wake of Twitter's decision to ban Trump from the platform for 24 hours

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43467 entries, 0 to 43466
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            43467 non-null  int64  
 1   tweet_url                     43467 non-null  object 
 2   created_at                    43467 non-null  object 
 3   parsed_created_at             43467 non-null  object 
 4   user_screen_name              43467 non-null  object 
 5   text                          43467 non-null  object 
 6   tweet_type                    43467 non-null  object 
 7   coordinates                   5 non-null      object 
 8   hashtags                      5801 non-null   object 
 9   media                         3173 non-null   object 
 10  urls                          10355 non-null  object 
 11  favorite_count                43467 non-null  int64  
 12  in_reply_to_screen_name       32802 non-null  object 
 13  i

In [5]:
# count number of languages in data set

count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

43 ['en' 'und' 'es' 'bg' 'hi' 'it' 'in' 'ar' 'pt' 'et' 'tr' 'tl' 'pl' 'sv'
 'fa' 'fr' 'eu' 'de' 'no' 'ht' 'fi' 'lv' 'ja' 'ta' 'ko' 'vi' 'cy' 'nl'
 'da' 'ro' 'sl' 'ml' 'ru' 'ur' 'hu' 'ca' 'iw' 'bn' 'zh' 'cs' 'sd' 'lt'
 'ne']


In [7]:
# tweets are in 43 different languages

# I'll be working only with Tweets in English
# drop tweets in all other languages
# now working with 35556 Tweets 

df = df[df.lang == 'en']
df.shape

(35556, 37)

In [8]:
# drop unnecessary columns 

df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang', 'user_location', 'user_time_zone', 'user_urls', 'place'], axis=1)

In [10]:
# check start time & date of data

df.iloc[0]

# first Tweet downloaded Jan 8, 2021 at 7:08:58

id                                                  1347440141388963841
parsed_created_at                             2021-01-08 07:08:58+00:00
user_screen_name                                        chinweconstanc4
text                  @Jack please bring the @verified back, Erica i...
tweet_type                                                      retweet
hashtags                        WatchEricaOnYoutube WatchEricaOnYoutube
favorite_count                                                        4
possibly_sensitive                                                 True
retweet_count                                                        16
user_id                                             1076486309370384386
Name: 0, dtype: object

In [12]:
# check end time & date of data 

df.iloc[-1]

# Tweets go back as far as Jan 31, 2020 at 22:07:04

id                                                  1344767054298148870
parsed_created_at                             2020-12-31 22:07:04+00:00
user_screen_name                                        SuccessfulErica
text                  @jack @verified please verify @EricaNlewedim a...
tweet_type                                                      retweet
hashtags                                                            NaN
favorite_count                                                       15
possibly_sensitive                                                  NaN
retweet_count                                                        12
user_id                                             1323407186127630337
Name: 43466, dtype: object

# Part 2: Text processing for NLP 

In [14]:
# create variable for "text" column 

text = df['text'] 

In [15]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers
# import necessary libraries: ntlk etc.

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 

stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [16]:
# apply text processing functions to text

df['processed_text'] = df['text'].apply(process_text)

In [17]:
# look at some of processed text

pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  This is separate from the ipykernel package so we can avoid doing imports until


0     [@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]                                                                                                     
1     [@jack, @verified, please, verify, @ericanlewedim]                                                                                                                                                    
2     [@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]                                                                                                     
6     [@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]                                                                                                     
7     [@jack, common, cuecat, guy, woz, know, hope, sleep, well, mountain, gold]                                                                                                    

In [18]:
# part-of-speech tagging 

ready_for_pos = df['processed_text']

def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [19]:
# lemmatizing

pos_tagged = df['pos_tagged']

wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [20]:
# look at lemmatized text

df['lemmatized'] = lemmatized
lemmatized[:20]

[['@jack',
  'please',
  'bring',
  '@verified',
  'back',
  'erica',
  'worth',
  '#watchericaonyoutube',
  '#watchericaonyoutube'],
 ['@jack', '@verified', 'please', 'verify', '@ericanlewedim'],
 ['@jack',
  'please',
  'bring',
  '@verified',
  'back',
  'erica',
  'worth',
  '#watchericaonyoutube',
  '#watchericaonyoutube'],
 ['@jack',
  'please',
  'bring',
  '@verified',
  'back',
  'erica',
  'worth',
  '#watchericaonyoutube',
  '#watchericaonyoutube'],
 ['@jack',
  'common',
  'cuecat',
  'guy',
  'woz',
  'know',
  'hope',
  'sleep',
  'well',
  'mountain',
  'gold'],
 ['@jack',
  'god',
  'see',
  'effort',
  'work',
  'changing',
  "people's",
  'life',
  'help',
  "i'm",
  'orphan',
  'working',
  'hard',
  'make',
  'difference',
  'family',
  'society',
  'large',
  'absolutely',
  'need',
  'help',
  'meletusejike@gmail.com'],
 ['@jack',
  'amazing',
  'really',
  'feel',
  '#motherlandmission',
  'talk',
  'ch',
  'pointed',
  'need',
  'supporting',
  'basic',
  'need'

In [21]:
# before vectorizing, cast lists of words back into strings

df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))
pd.set_option('display.max_colwidth', -1)
final_docs = df['final_docs']
final_docs[3000:3020]

  after removing the cwd from sys.path.


3714    @jack advisor well scavino one                                                                                                                                                                                                    
3715    @jack twitter need protect democracy encourage peaceful transition power like colleague @facebook done although believe banned life two week danger america #bantrump #banhim #peacefultransferofpower                            
3716    @jack please verify @ericanlewedim                                                                                                                                                                                                
3717    @jack pls verify @ericanlewedim @verified great influencer brand ambassador lot brand also award-winning actress please verify #ericatheinfluencer                                                                                
3718    @jack decent human please block                     

In [22]:
#create document term matrix with TFIDF

#import vectorizing tool (usee TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer
# set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

# Part 3: run NMF and LDA models, for topic modeling

In [23]:
#run NMF model 

#import NMF tool 
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=6)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)


In [24]:
# run LDA model

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components = 6, max_iter=10, learning_method='online', learning_decay=.9)
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [25]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 20)

LDA Model:
Topic 0:
[('twitter', 289.420898143721), ('ban', 246.12047345411372), ('jack time', 235.18943740253638), ('jack twitter', 233.77553197394104), ('time', 220.33850692006303), ('jack ban', 187.70733686051955), ('mazinnamdikanu', 165.0196655468166), ('verify mazinnamdikanu', 145.68065751019677), ('ban trump', 143.3447475260875), ('twitter shut', 125.97440725484387)]
Topic 1:
[('account', 695.58456041917), ('suspend', 675.062884595675), ('trump account', 598.5108708057622), ('trump', 543.186502136906), ('twitter twittersafety', 542.0029336669878), ('time suspend', 540.2541767505044), ('president', 537.5126271785506), ('safety', 528.3778973921864), ('lie', 527.9256121986549), ('incited', 526.2635246761794)]
Topic 2:
[('verify', 685.1424292274853), ('jack verify', 487.9622583842279), ('twittersupport', 389.4757858539096), ('ericanlewedim', 379.3257855040641), ('verify ericanlewedim', 322.60914223126895), ('verified', 285.17832052736156), ('jack twittersupport', 262.42798705665166),

# Part 4: Run visualization and testing of LDA model

In [27]:
# visualization of LDA model 
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [28]:
# test LDA model

# log likelihood (higher score is better)
print("Log likelihood: ", lda_model.score(doc_term_matrix_1))

# perplexity (lower score is better)
print("Perplexity: ", lda_model.perplexity(doc_term_matrix_1))

Log likelihood:  -691028.455410083
Perplexity:  893.6781241405488


In [29]:
# cross-validation to find best parameters for LDA model

#import cross-validation tool
from sklearn.model_selection import GridSearchCV

# define search parameters
search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# initialize model for cross-validation
lda = LatentDirichletAllocation()

# initialize grid search class 
model = GridSearchCV(lda, param_grid=search_params)

# run grid search 
model.fit(doc_term_matrix_1)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 8, 10, 12]})

In [30]:
# what is best model? 
best_lda_model = model.best_estimator_

# print parameters for best model
print("Parameters for best model: ", model.best_params_)

# print log likelihood score
print("Log likelihood score of best model: ", model.best_score_)

# print perplexity score
print("Perplexity score of best model: ", best_lda_model.perplexity(doc_term_matrix_1))

Parameters for best model:  {'learning_decay': 0.9, 'n_components': 5}
Log likelihood score of best model:  -156150.2933076067
Perplexity score of best model:  745.0305368649379


In [31]:
# 2nd LDA model: running new "best model" parameters

lda_model_2 = LatentDirichletAllocation(n_components = 5, max_iter=10, learning_method='online', learning_decay=0.5)
lda_Z_2 = lda_model_2.fit_transform(doc_term_matrix_1)

# visualization of 2nd LDA model 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

# Part 5: Additional data analysis

In [32]:
# insert NMF model, LDA model 1, and LDA model 2 results back into dataframe
topic_values = nmf_model.transform(doc_term_matrix_1)
df['NMF_topic'] = topic_values.argmax(axis=1)
df.head(1)

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,possibly_sensitive,retweet_count,user_id,processed_text,pos_tagged,lemmatized,final_docs,NMF_topic
0,1347440141388963841,2021-01-08 07:08:58+00:00,chinweconstanc4,"@Jack please bring the @verified back, Erica is worth it.\n#WatchEricaOnYoutube\n#WatchEricaOnYoutube https://t.co/jSv54rTzcx",retweet,WatchEricaOnYoutube WatchEricaOnYoutube,4,True,16,1076486309370384386,"[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]","[(@jack, NN), (please, NN), (bring, VB), (@verified, VBN), (back, RB), (erica, JJ), (worth, NN), (#watchericaonyoutube, NNP), (#watchericaonyoutube, NN)]","[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]",@jack please bring @verified back erica worth #watchericaonyoutube #watchericaonyoutube,4


In [33]:
topic_values = lda_model.transform(doc_term_matrix_1)
df['LDA1_topic'] = topic_values.argmax(axis=1)
df.head(1)

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,possibly_sensitive,retweet_count,user_id,processed_text,pos_tagged,lemmatized,final_docs,NMF_topic,LDA1_topic
0,1347440141388963841,2021-01-08 07:08:58+00:00,chinweconstanc4,"@Jack please bring the @verified back, Erica is worth it.\n#WatchEricaOnYoutube\n#WatchEricaOnYoutube https://t.co/jSv54rTzcx",retweet,WatchEricaOnYoutube WatchEricaOnYoutube,4,True,16,1076486309370384386,"[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]","[(@jack, NN), (please, NN), (bring, VB), (@verified, VBN), (back, RB), (erica, JJ), (worth, NN), (#watchericaonyoutube, NNP), (#watchericaonyoutube, NN)]","[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]",@jack please bring @verified back erica worth #watchericaonyoutube #watchericaonyoutube,4,0


In [34]:
topic_values = lda_model_2.transform(doc_term_matrix_1)
df['LDA2_topic'] = topic_values.argmax(axis=1)
df.head(20)

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,possibly_sensitive,retweet_count,user_id,processed_text,pos_tagged,lemmatized,final_docs,NMF_topic,LDA1_topic,LDA2_topic
0,1347440141388963841,2021-01-08 07:08:58+00:00,chinweconstanc4,"@Jack please bring the @verified back, Erica is worth it.\n#WatchEricaOnYoutube\n#WatchEricaOnYoutube https://t.co/jSv54rTzcx",retweet,WatchEricaOnYoutube WatchEricaOnYoutube,4,True,16,1076486309370384386,"[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]","[(@jack, NN), (please, NN), (bring, VB), (@verified, VBN), (back, RB), (erica, JJ), (worth, NN), (#watchericaonyoutube, NNP), (#watchericaonyoutube, NN)]","[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]",@jack please bring @verified back erica worth #watchericaonyoutube #watchericaonyoutube,4,0,2
1,1347439976796090369,2021-01-08 07:08:19+00:00,adaobi_ani,@Jack and @verified please verify @EricaNlewedim,retweet,,15,,11,1318431923660795907,"[@jack, @verified, please, verify, @ericanlewedim]","[(@jack, RB), (@verified, JJ), (please, NN), (verify, VB), (@ericanlewedim, NN)]","[@jack, @verified, please, verify, @ericanlewedim]",@jack @verified please verify @ericanlewedim,4,2,3
2,1347439857308729345,2021-01-08 07:07:50+00:00,elitecompany41,"@Jack please bring the @verified back, Erica is worth it.\n#WatchEricaOnYoutube\n#WatchEricaOnYoutube https://t.co/jSv54rTzcx",retweet,WatchEricaOnYoutube WatchEricaOnYoutube,4,False,16,1322512992769028096,"[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]","[(@jack, NN), (please, NN), (bring, VB), (@verified, VBN), (back, RB), (erica, JJ), (worth, NN), (#watchericaonyoutube, NNP), (#watchericaonyoutube, NN)]","[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]",@jack please bring @verified back erica worth #watchericaonyoutube #watchericaonyoutube,4,0,2
6,1347439335407235073,2021-01-08 07:05:46+00:00,Candice__Green,"@Jack please bring the @verified back, Erica is worth it.\n#WatchEricaOnYoutube\n#WatchEricaOnYoutube https://t.co/jSv54rTzcx",retweet,WatchEricaOnYoutube WatchEricaOnYoutube,4,False,16,979406523267985411,"[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]","[(@jack, NN), (please, NN), (bring, VB), (@verified, VBN), (back, RB), (erica, JJ), (worth, NN), (#watchericaonyoutube, NNP), (#watchericaonyoutube, NN)]","[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]",@jack please bring @verified back erica worth #watchericaonyoutube #watchericaonyoutube,4,0,2
7,1347439190196314114,2021-01-08 07:05:11+00:00,SrslySteve,"@jack You have more in common with the Cuecat guy than you do with Woz, and you know that. Hope you sleep well on your mountain of gold.",reply,,0,,0,4707712217,"[@jack, common, cuecat, guy, woz, know, hope, sleep, well, mountain, gold]","[(@jack, NN), (common, JJ), (cuecat, NN), (guy, NN), (woz, NN), (know, VBP), (hope, NN), (sleep, RB), (well, RB), (mountain, VB), (gold, NN)]","[@jack, common, cuecat, guy, woz, know, hope, sleep, well, mountain, gold]",@jack common cuecat guy woz know hope sleep well mountain gold,1,5,0
8,1347438729552560128,2021-01-08 07:03:21+00:00,91011_gray,@jack God sees your effort and work in changing people's lives with your help. I'm an orphan that is working hard to make a difference in my family and society at large but I absolutely need help. meletusejike12@gmail.com.,retweet,,1,,1,2169363216,"[@jack, god, sees, effort, work, changing, people's, lives, help, i'm, orphan, working, hard, make, difference, family, society, large, absolutely, need, help, meletusejike@gmail.com]","[(@jack, NN), (god, NN), (sees, VBZ), (effort, NN), (work, NN), (changing, VBG), (people's, NN), (lives, NNS), (help, VBP), (i'm, VB), (orphan, JJ), (working, VBG), (hard, JJ), (make, NN), (difference, NN), (family, NN), (society, NN), (large, JJ), (absolutely, RB), (need, VBP), (help, NN), (meletusejike@gmail.com, VB)]","[@jack, god, see, effort, work, changing, people's, life, help, i'm, orphan, working, hard, make, difference, family, society, large, absolutely, need, help, meletusejike@gmail.com]",@jack god see effort work changing people's life help i'm orphan working hard make difference family society large absolutely need help meletusejike@gmail.com,3,3,0
9,1347438417081102336,2021-01-08 07:02:07+00:00,91011_gray,@jack Amazing and really feel that the #MotherlandMission talk in CH pointed to a need for supporting basic needs support in places where tech ecosystem is built. I suck at tweeting well 😇 but I’ll try to tweet the orgs doing this during week,retweet,MotherlandMission,1,,1,2169363216,"[@jack, amazing, really, feel, #motherlandmission, talk, ch, pointed, need, supporting, basic, needs, support, places, tech, ecosystem, built, suck, tweeting, well, try, tweet, orgs, week]","[(@jack, NN), (amazing, VBG), (really, RB), (feel, VB), (#motherlandmission, NN), (talk, NN), (ch, NN), (pointed, VBD), (need, MD), (supporting, VBG), (basic, JJ), (needs, NNS), (support, NN), (places, NNS), (tech, VBP), (ecosystem, NN), (built, VBN), (suck, JJ), (tweeting, NN), (well, RB), (try, VB), (tweet, NN), (orgs, IN), (week, NN)]","[@jack, amazing, really, feel, #motherlandmission, talk, ch, pointed, need, supporting, basic, need, support, place, tech, ecosystem, built, suck, tweeting, well, try, tweet, orgs, week]",@jack amazing really feel #motherlandmission talk ch pointed need supporting basic need support place tech ecosystem built suck tweeting well try tweet orgs week,1,4,0
10,1347438167465615360,2021-01-08 07:01:07+00:00,Justice67662170,@jack time for you to act and be responsible. https://t.co/2miuioPkeL,quote,,0,False,0,964590993336164355,"[@jack, time, act, responsible]","[(@jack, NN), (time, NN), (act, NN), (responsible, JJ)]","[@jack, time, act, responsible]",@jack time act responsible,2,0,1
11,1347437779429453824,2021-01-08 06:59:35+00:00,Nkiruka70920363,"@Jack please bring the @verified back, Erica is worth it.\n#WatchEricaOnYoutube\n#WatchEricaOnYoutube https://t.co/jSv54rTzcx",retweet,WatchEricaOnYoutube WatchEricaOnYoutube,4,False,16,1315258533630410753,"[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]","[(@jack, NN), (please, NN), (bring, VB), (@verified, VBN), (back, RB), (erica, JJ), (worth, NN), (#watchericaonyoutube, NNP), (#watchericaonyoutube, NN)]","[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]",@jack please bring @verified back erica worth #watchericaonyoutube #watchericaonyoutube,4,0,2
12,1347437582800592898,2021-01-08 06:58:48+00:00,mpho_jacqueline,"@Jack please bring the @verified back, Erica is worth it.\n#WatchEricaOnYoutube\n#WatchEricaOnYoutube https://t.co/jSv54rTzcx",retweet,WatchEricaOnYoutube WatchEricaOnYoutube,4,False,16,1292454108549570560,"[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]","[(@jack, NN), (please, NN), (bring, VB), (@verified, VBN), (back, RB), (erica, JJ), (worth, NN), (#watchericaonyoutube, NNP), (#watchericaonyoutube, NN)]","[@jack, please, bring, @verified, back, erica, worth, #watchericaonyoutube, #watchericaonyoutube]",@jack please bring @verified back erica worth #watchericaonyoutube #watchericaonyoutube,4,0,2


In [35]:
df.columns

Index(['id', 'parsed_created_at', 'user_screen_name', 'text', 'tweet_type',
       'hashtags', 'favorite_count', 'possibly_sensitive', 'retweet_count',
       'user_id', 'processed_text', 'pos_tagged', 'lemmatized', 'final_docs',
       'NMF_topic', 'LDA1_topic', 'LDA2_topic'],
      dtype='object')

In [36]:
# drop unnecessary columns
df = df.drop(['id', 'parsed_created_at', 'user_screen_name', 'text',
       'hashtags', 'user_id', 'processed_text', 'pos_tagged', 'lemmatized', 'final_docs'], axis=1)

In [37]:
df.head(20)

Unnamed: 0,tweet_type,favorite_count,possibly_sensitive,retweet_count,NMF_topic,LDA1_topic,LDA2_topic
0,retweet,4,True,16,4,0,2
1,retweet,15,,11,4,2,3
2,retweet,4,False,16,4,0,2
6,retweet,4,False,16,4,0,2
7,reply,0,,0,1,5,0
8,retweet,1,,1,3,3,0
9,retweet,1,,1,1,4,0
10,quote,0,False,0,2,0,1
11,retweet,4,False,16,4,0,2
12,retweet,4,False,16,4,0,2


In [39]:
df.columns

Index(['tweet_type', 'favorite_count', 'possibly_sensitive', 'retweet_count',
       'NMF_topic', 'LDA1_topic', 'LDA2_topic'],
      dtype='object')