In [1]:
# NLP
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
import nltk.corpus
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Plotting
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Utilities
import os
import re
import string
import pandas as pd

# APIs
import pymongo
import tweepy

# Scikit-Learn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Connection settings are read from the environment so that no credential is stored in the notebook.
# MONGO_URI is optional and falls back to the local default instance.
mongo = os.environ.get("MONGO_URI", "mongodb://localhost:27017/")
# TWITTER_BEARER_TOKEN is required: a missing variable raises KeyError here instead of failing on the first API call.
# NOTE(review): any bearer token that was previously committed in this cell should be regenerated.
twitter = os.environ["TWITTER_BEARER_TOKEN"]

dbclient = pymongo.MongoClient(mongo) # Initiate connection to MongoDB
# return_type has to be a type object. The fetch cell reads `response.data` and `tweet.id`,
# so the structured tweepy.Response (the library default) is requested explicitly.
api = tweepy.Client(bearer_token=twitter, wait_on_rate_limit=True, return_type=tweepy.Response) # Initiate tweepy client

# Collection handles of the `sna_database` database
sna_database = dbclient.sna_database
pFollower = sna_database.primaryFollowers
sFollower = sna_database.secondaryFollowers
sFollowing = sna_database.secondaryFollowing
combined = sna_database.combined

# Full collections loaded into DataFrames; the secondary collections are read in ascending `_id` order
df_pFollower = pd.DataFrame(list(pFollower.find()))
df_sFollower = pd.DataFrame(list(sFollower.find().sort("_id",1)))
df_sFollowing = pd.DataFrame(list(sFollowing.find().sort("_id",1)))

In [None]:
# Method for cleaning up text, making it easier for NLP
def nlp_preprocessing(group,dataset,stemOutputPath,lemmaOutputPath):
    """Clean the ``description`` texts of ``dataset`` and store stemmed and lemmatized copies.

    Every description is lower-cased; mentions, URLs, a leading "rt" marker and
    every character other than ASCII letters, digits, spaces and tabs are
    replaced by a space; NLTK English stopwords are dropped. Descriptions that
    are missing, blank, equal to "n/a", or empty after cleaning are skipped.
    The surviving texts are stemmed and lemmatized word by word, independently
    of each other, and each variant is written to its own single-column CSV.

    Parameters
    ----------
    group
        Label that is only used in the completion message.
    dataset
        Object whose ``'description'`` item yields the raw texts
        (the notebook passes a pandas DataFrame).
    stemOutputPath
        Destination of the CSV holding the stemmed texts.
    lemmaOutputPath
        Destination of the CSV holding the lemmatized texts.

    Returns
    -------
    None
        The function prints "Complete Group <group>" when it is done.
    """
    stop = set(stopwords.words('english'))  # set membership is O(1) per word

    # mentions (underscores are valid in handles), non-alphanumerics, URLs, leading "rt" marker
    noise = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt\b|http.+?\n")

    token_lists = []
    for raw in dataset['description']:
        # missing values (None/NaN) carry no text and have no .lower()
        if not isinstance(raw, str):
            continue

        # normalize text
        lowered = raw.lower()

        # placeholders have to be recognised before cleaning strips the "/" out of "n/a"
        if lowered.strip() in ("", "n/a"):
            continue

        # remove unicode characters, mentions, and retweets, then the stopwords
        tokens = [word for word in noise.sub(" ", lowered).split() if word not in stop]
        if tokens:
            token_lists.append(tokens)

    # stemming: PorterStemmer.stem() handles one word at a time, so it is applied per token
    stemmer = PorterStemmer()
    text_stem = [" ".join(stemmer.stem(word) for word in tokens) for tokens in token_lists]

    # lemmatization: built from the cleaned tokens, not from the stemmed text
    lemmatizer = WordNetLemmatizer()
    text_lemma = [" ".join(lemmatizer.lemmatize(word) for word in tokens) for tokens in token_lists]

    # output the stemmed and lemmatized texts to csv for storage
    for path, texts in ((stemOutputPath, text_stem), (lemmaOutputPath, text_lemma)):
        if isinstance(path, (str, os.PathLike)):
            # create the target folder on first use; "" means the current directory
            os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        pd.Series(texts, dtype=object).to_csv(path,index=False)

    print("Complete Group " + str(group))

# Method for creating the wordcloud from the results of the nlp_preprocessing()
def wordcloud_vis(lemmaInput,stemInput,lemmaOutput,stemOutput):
    """Render one word cloud image from the lemmatized texts and one from the stemmed texts.

    Parameters
    ----------
    lemmaInput
        Iterable of lemmatized texts; entries that are not strings are ignored.
    stemInput
        Iterable of stemmed texts; entries that are not strings are ignored.
    lemmaOutput
        File the lemmatized word cloud image is written to.
    stemOutput
        File the stemmed word cloud image is written to.

    Both clouds are 1600x1600 and exclude the NLTK English stopwords plus the
    place names 'missoula', 'montana', 'montanan' and 'mt'.
    """
    stop = stopwords.words('english')
    stop.extend(['missoula','montana','montanan','mt'])

    # The image is written by WordCloud itself, so no matplotlib figure is created here.
    for texts, output in ((lemmaInput, lemmaOutput), (stemInput, stemOutput)):
        # Lists loaded with pandas.read_csv can contain NaN (cells such as "null" or
        # "nan" are parsed as missing); str.join would raise TypeError on them.
        corpus = " ".join(item for item in texts if isinstance(item, str))

        if isinstance(output, (str, os.PathLike)):
            # create the target folder on first use; "" means the current directory
            os.makedirs(os.path.dirname(output) or ".", exist_ok=True)

        WordCloud(stopwords=stop,width=1600,height=1600).generate(corpus).to_file(output)

## Secondary Followers

In [None]:
# Description NLP and word clouds for the secondary followers, one pass per group (0-5)

for i in range(6):
    # Step 1: clean the group's non-empty descriptions and write both CSV variants
    print("Preprocessing")
    stemToPath = "sFollowers/descriptions/stemmed/group%s.csv" % i
    lemmaToPath = "sFollowers/descriptions/lemmatized/group%s.csv" % i
    query = {'group': i, 'description':{'$ne':''}}
    group_documents = sFollower.find(query)
    temp = pd.DataFrame(list(group_documents))
    nlp_preprocessing(i,temp,stemToPath,lemmaToPath)

    # Step 2: read the CSVs back (single column named '0') and draw the clouds
    print("wordcloud Generation")
    temp_lemma = pd.read_csv(lemmaToPath)
    temp_stem = pd.read_csv(stemToPath)
    lemma = list(temp_lemma['0'])
    stem = list(temp_stem['0'])

    lemmaWordcloudOutput = "dataVis/sFollowers/wordcloud/lemmatized/group%s.png" % i
    stemWordcloudOutput = "dataVis/sFollowers/wordcloud/stemmed/group%s.png" % i
    wordcloud_vis(lemma,stem,lemmaWordcloudOutput,stemWordcloudOutput)

## Sentiment Analysis of the Influencers
First, we analyze the influencer of the combined dataset: ryanpcooney. Before the tweets can be analyzed, they must be retrieved from the Twitter API.

In [None]:
# Analyzing ryanpcooney
# Download the user's own tweets (retweets excluded), up to 100 pages of 100 tweets each.
tweets_text = []
tweets_id = []

for response in tweepy.Paginator(api.get_users_tweets, id=50103302, max_results = 100, limit=100, exclude="retweets"):
    # response.data is None when a page carries no tweets; iterating None would raise TypeError
    page = response.data or []
    for tweet in page:
        tweets_id.append(tweet.id)
        tweets_text.append(tweet.text)
    # one progress line per page instead of dumping every response and tweet
    print("Fetched %d tweets (%d total)" % (len(page), len(tweets_id)))

# Kept in their one-element-list shape in case another cell still reads these names.
tweets_id_list = [tweets_id]
tweets_text_list = [tweets_text]

# Column order ("id", "text") matches the CSV layout used so far
df_tweets = pd.DataFrame({"id": tweets_id, "text": tweets_text}, columns=["id","text"])

df_tweets.to_csv("influencers/combined/ryanpcooney/tweets.csv",index=False)

In [28]:
# clean and remove emojis
# Only emoji and pictographic code points are listed. A single catch-all range from
# U+24C2 to U+1F251 would also delete CJK, Hangul and most other non-Latin text.
RE_EMOJI = re.compile(
    "["
    "\U0001F000-\U0001F2FF"  # mahjong/domino/card tiles, enclosed characters, flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U000024C2"             # circled M
    "\U000025A0-\U000025FF"  # Geometric Shapes
    "\U00002600-\U000027BF"  # Miscellaneous Symbols and Dingbats
    "\U00002934\U00002935"   # curved arrows
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "\U0000FE00-\U0000FE0F"  # variation selectors that follow emoji
    "]+"
    )

def cleaning_emojis(text):
    """Return ``text`` with every run of characters matched by ``RE_EMOJI`` removed.

    Nothing is inserted in place of a removed run, so surrounding spaces are kept as they are.
    """
    return RE_EMOJI.sub(r'',text)
# strip emojis from every tweet, then dump the column for inspection
df_tweets["text"] = df_tweets["text"].apply(cleaning_emojis)
df_tweets["text"].to_csv("test.csv", index=False)

In [29]:
# clean and remove line breaks
def cleaning_linebreaks(text):
    """Return ``text`` with every line break replaced by one space.

    A CRLF pair counts as a single break; a lone CR or LF is a break as well.
    """
    return re.sub(r'\r\n|\r|\n',' ',text)
# flatten every tweet onto one line, then dump the column for inspection
df_tweets["text"] = df_tweets["text"].apply(cleaning_linebreaks)
df_tweets["text"].to_csv("test.csv", index=False)

In [None]:
# cleaning and removing retweets
def cleaning_retweets(text):
    """Replace a standalone "RT" marker and everything after it on that line with one space.

    The word boundaries keep capital "RT" inside other words (for example "SMART")
    from truncating the tweet. A bare "RT" with nothing after it is left untouched.
    """
    return re.sub(r'\bRT\b.+',' ',text)
# Operate on df_tweets, the frame every other cleaning cell updates (`tweets` is not defined anywhere).
df_tweets["text"] = df_tweets["text"].apply(cleaning_retweets)
# texts that became identical after the retweet part was cut are collapsed into one row
df_tweets.drop_duplicates(subset="text",inplace=True)
df_tweets["text"].to_csv("test.csv",index=False)

In [None]:
# cleaning and removing mentions
def cleaning_mentions(text):
    """Return ``text`` with each "@handle" (letters, digits, underscores) replaced by one space."""
    mention = re.compile(r'(@[A-Za-z0-9_]+)')
    return mention.sub(' ', text)
# Operate on df_tweets, the frame every other cleaning cell updates (`tweets` is not defined anywhere).
df_tweets["text"] = df_tweets["text"].apply(cleaning_mentions)
df_tweets["text"].to_csv("test.csv",index=False)

In [30]:
# lower-case every tweet so later matching is case-insensitive, then dump the column for inspection
lowered_text = df_tweets["text"].str.lower()
df_tweets["text"] = lowered_text
df_tweets["text"].to_csv("test.csv", index=False)

In [31]:
# clean and remove URLs
def cleaning_URLs(text):
    """Replace URLs and backticks in ``text`` with one space each.

    A URL is "www." or "http://" / "https://" followed by everything up to the
    next whitespace character, so the words after a link are preserved.
    """
    # \S+ stops at whitespace; the dot after "www" is escaped so it only matches a literal "."
    return re.sub(r'(www\.\S+)|(https?://\S+)|(`)',' ',text)
# remove links from every tweet, then dump the column for inspection
df_tweets["text"] = df_tweets["text"].apply(cleaning_URLs)
df_tweets["text"].to_csv("test.csv", index=False)

In [32]:
# cleaning and removing punctuations
english_punctuation = string.punctuation
punctuations_list = english_punctuation
def cleaning_punctuations(text):
    """Return ``text`` with every character of ``english_punctuation`` deleted.

    Characters are removed, not replaced, so words joined by punctuation run together.
    """
    # a translation table that maps a code point to None deletes that character
    deletion_table = dict.fromkeys(map(ord, english_punctuation))
    return text.translate(deletion_table)
# ASCII punctuation first ...
without_ascii_punctuation = df_tweets["text"].apply(cleaning_punctuations)
# ... then the typographic quotes, which cleaning_punctuations does not cover
df_tweets["text"] = without_ascii_punctuation.apply(lambda x: re.sub(r'(’)|(”)|(“)|(")','',x))
df_tweets["text"].to_csv("test.csv", index=False)

In [33]:
# remove stopwords
stop = stopwords.words('english')
def cleaning_stopwords(text):
    """Return ``text`` (converted with ``str``) without the words listed in ``stop``.

    Words are split on whitespace and the kept ones are re-joined with single spaces.
    """
    kept_words = []
    for word in str(text).split():
        if word in stop:
            continue
        kept_words.append(word)
    return " ".join(kept_words)
# drop stopwords from every tweet, then dump the column for inspection
df_tweets["text"] = df_tweets["text"].apply(cleaning_stopwords)
df_tweets["text"].to_csv("test.csv", index=False)

In [34]:
# tweets whose cleaned text is identical are collapsed, keeping the first occurrence
df_tweets.drop_duplicates(subset=["text"], keep="first", inplace=True)
# written without a header row this time
df_tweets["text"].to_csv("test.csv", header=False, index=False)

In [35]:
# stem the words
stemmer = PorterStemmer()

# PorterStemmer.stem() treats its argument as one word, so every tweet is split on
# whitespace and stemmed word by word before being joined back together.
text_stem = [
    " ".join(stemmer.stem(word) for word in str(tweet).split())
    for tweet in df_tweets["text"]
]
stem_series = pd.Series(text_stem)
stem_series.to_csv("influencers/combined/ryanpcooney/tweets_stem.csv",header=False,index=False)

In [36]:
# lemmatize the words
lemmatizer = WordNetLemmatizer()

# WordNetLemmatizer.lemmatize() looks up a single word, so every tweet is split on
# whitespace and lemmatized word by word before being joined back together.
text_lemma = [
    " ".join(lemmatizer.lemmatize(word) for word in str(tweet).split())
    for tweet in df_tweets["text"]
]
lemma_series = pd.Series(text_lemma)
lemma_series.to_csv("influencers/combined/ryanpcooney/tweets_lemma.csv",header=False,index=False)

In [37]:
# generate a wordcloud from the lemmatized tweets
# No stopword list is passed, so WordCloud falls back to its built-in default list.
wc = WordCloud(width=1600,height=800).generate(" ".join(text_lemma))
# the trailing semicolon keeps the WordCloud repr out of the cell output
wc.to_file("influencers/combined/ryanpcooney/vis/wordclound.png");

<wordcloud.wordcloud.WordCloud at 0x7f78c14144c0>

In [None]:
#TODO: Iterate through all influencers of the modularity classes and create wordclouds from the lemmatized tweets. When outputting all data, handle unmade files (make file if not exist).
#TODO: Create wordclouds for either each user, or modularity class.
#TODO: Perfect the preprocessing process (Handle unseen characters, line breaks, etc) 