In [None]:
### Installations

%pip install pandas==1.1.5
%pip install numpy

In [None]:
### Further Installations

%pip install pyLDAvis
%pip install pyLDAvis.gensim
%pip install bokeh
%pip install gensim
%pip install spacy
%pip install logging
%pip install wordcloud
%pip install nltk
%pip install -U pip setuptools wheel
%pip install -U spacy
%python -m spacy download en_core_web_sm
%pip install -U seaborn
%pip install translators

In [3]:
### Imports
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
import sys
import re
import numpy as np
import requests
import xml
import xml.etree.ElementTree as ET 
import logging, warnings
import spacy, gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from requests.utils import requote_uri
from requests.utils import requote_uri
from pprint import pprint


In [4]:
def splice(doc):
    """Query the SPLICE web service for linguistic features of ``doc``.

    A single GET request is sent to ``http://splice.cmi.arizona.edu/SPLICE/``
    with ``doc`` and a space-separated list of feature names in the URL path.
    The response body is parsed as XML.

    Args:
        doc: The text to analyse; it is interpolated into the URL path and
            percent-encoded by ``requote_uri``.

    Returns:
        A list of ``[tag, text]`` two-element lists, one per direct child of
        the XML root element, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
        Any exception raised by ``requests`` for the HTTP request itself.
    """
    counts = 'numChars numCharsMinusSpacesAndPunctuation numWords numSentences numPunctuation'
    speech = 'numNouns nounRatio numVerbs verbRatio numAdjectives adjectiveRatio numAdverbs adverbRatio'
    immediacy = 'numPassiveVerbs passiveVerbRatio'
    pronouns = 'firstPersonSingular firstPersonSingularRatio firstPersonPlural firstPersonPluralRatio secondPerson secondPersonRatio thirdPersonSingular thirdPersonSingularRatio thirdPersonPlural thirdPersonPluralRatio'
    # NOTE(review): positive_self and negative_self are defined but are not
    # interpolated into the request URL below - confirm whether that omission
    # is intentional before adding them.
    positive_self = 'iCanDoIt doKnow posSelfImage'
    negative_self = 'iCantDoIt dontKnow negSelfImage'
    influence = 'numImperatives suggestionPhrases inflexibility contradict totalDominance dominanceRatio numAgreement agreementRatio'
    deference = "askPermission seekGuidance totalSubmissiveness submissivenessRatio"
    affect = "Imagery Pleasantness Activation"
    complexity = "avgWordLength avgSentenceLength numSyllables avgSyllablesPerWord numWordsWith3OrMoreSyllables rateWordsWith3OrMoreSyllables numWordsWith6OrMoreChars rateWordsWith6OrMoreChars numWordsWith7OrMoreChars rateWordsWith7OrMoreChars LexicalDiversity complexityComposite"
    style = "hedgeVerb hedgeConj hedgeAdj hedgeModal hedgeAll numDisfluencies disfluencyRatio numInterjections interjectionRatio numSpeculate speculateRatio Expressivity numIgnorance ignoranceRatio Pausality questionCount questionRatio hedgeUncertain"
    tense = 'pastTense pastTenseRatio presentTense presentTenseRatio'
    sentiment = "SWNpositivity SWNnegativity SWNobjectivity"
    readability = "ARI FRE FKGL CLI LWRF FOG SMOG DALE LIX RIX FRY"

    url = f"http://splice.cmi.arizona.edu/SPLICE/{doc}/{counts} {speech} {immediacy} {pronouns} {influence} {deference} {affect} {complexity} {style} {tense} {sentiment} {readability}"
    # The feature list is space-separated, so the URL must be percent-encoded.
    url_encoded = requote_uri(url)
    req = requests.request("GET", url_encoded)

    root = ET.fromstring(req.text)
    # Element.getchildren() was removed in Python 3.9; iterating an Element
    # yields its direct children, which is what getchildren() returned.
    return [[child.tag, child.text] for child in root]
      

In [10]:
def sent_to_words(sentences):
    """Lazily clean and tokenise each text in ``sentences``.

    For every input string the generator:
      1. removes every whitespace-delimited token that contains ``@``
         (e-mail addresses, and therefore also @mentions),
      2. collapses each run of whitespace (including newlines) to one space,
      3. removes single-quote characters,
      4. tokenises the result with ``gensim.utils.simple_preprocess`` using
         ``deacc=True``.

    Args:
        sentences: Iterable of ``str``. ``re.sub`` raises ``TypeError`` for
            non-string items.

    Yields:
        One list of tokens per input text, in input order.
    """
    for sent in sentences:
        # Raw strings: '\S' and '\s' are invalid escape sequences in ordinary
        # string literals and trigger a warning on current Python versions.
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails / @-tokens
        sent = re.sub(r'\s+', ' ', sent)  # collapse whitespace, incl. newlines
        sent = re.sub(r"'", "", sent)  # remove single quotes
        yield gensim.utils.simple_preprocess(str(sent), deacc=True)

In [16]:
# Make sure the NLTK stop-word corpus is available, then build the stop-word
# list: the English list plus three extra tokens specific to this corpus.
nltk.download('stopwords')
stop_words = stopwords.words('english') + ['from', 'subject', 'co']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove stop words, merge bigrams/trigrams and lemmatise each document.

    Steps, per document:
      1. stringify and re-tokenise with ``simple_preprocess`` and drop stop words,
      2. merge phrases with the global ``bigram_mod`` and then ``trigram_mod``,
      3. lemmatise with spaCy's ``en_core_web_sm`` and keep only tokens whose
         part-of-speech tag is in ``allowed_postags``,
      4. re-tokenise and drop stop words once more.

    Args:
        texts: Iterable of documents (token lists in this notebook).
        stop_words: Collection of words to discard. The default is the
            module-level ``stop_words`` as it was when this function was defined.
        allowed_postags: spaCy coarse POS tags to keep.

    Returns:
        A list with one list of tokens per input document.

    Raises:
        OSError: From ``spacy.load`` if the ``en_core_web_sm`` model is not
            installed.

    Note:
        ``bigram_mod`` and ``trigram_mod`` are looked up as globals at call
        time, so they must exist before this function is called.
    """
    # Sets give O(1) membership tests; the originals were lists scanned per word.
    stop_set = set(stop_words)
    allowed = set(allowed_postags)

    def _drop_stopwords(docs):
        # Each doc is stringified and re-tokenised before filtering.
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in docs]

    texts = _drop_stopwords(texts)
    # Apply the bigram model once and feed its output to the trigram model
    # (the bigram model used to be applied twice in a row).
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # Load spaCy without the parser and NER components, which are not needed here.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    texts_out = [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed]
        for sent in texts
    ]

    # Remove stop words once more: lemmatisation can produce new ones.
    return _drop_stopwords(texts_out)

In [5]:
# Fetch the VADER lexicon and create the sentiment analyser that is stored in
# `sia` for the scoring cell.
nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

# Load the tweets from a CSV file in the current working directory and show
# the first rows as the cell output.
df = pd.read_csv("noReplies.csv")
df.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\natha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0.1,Unnamed: 0,id,handle,followers,text,lang,possibly_sensitive,retweet_count,reply_count,like_count,...,contains_quote,is_reply,num_referenced_tweets,url_image,num_hashtags,text_first_hashtag,num_mentions,num_cashtags,num_polls,interaction_score
0,0,1470814862871777287,Schlotzskys,19593,Challenging you to not bite your phone. https:...,en,False,2,5,29,...,False,False,0,,0,,0,0,0,0.00171
1,1,1470095650150928385,Schlotzskys,19593,VOILA—this secret Santa is bringing Schlotzsky...,en,False,2,0,1,...,False,False,0,,0,,0,0,0,0.000332
2,2,1469339638334697475,Schlotzskys,19593,"No, no, no...you stay where you are. We'll com...",en,False,1,5,15,...,False,False,0,,0,,0,0,0,0.000791
3,3,1469032106068254736,Schlotzskys,19593,Tag your better half. #SharingIsCaring https:/...,en,False,2,5,34,...,False,False,0,,1,SharingIsCaring,0,0,0,0.001837
4,4,1467906260418895881,Schlotzskys,19593,No holiday spread is complete without Cinnabon...,en,False,2,0,4,...,False,False,0,,0,,0,0,0,0.000408


In [6]:
# Score every tweet with VADER and store the four scores as new columns.
# The tweet text is selected by column name instead of by tuple position, so
# the result does not depend on how many columns precede `text` in the CSV.
vader_scores = pd.DataFrame(
    [sia.polarity_scores(text) for text in df['text']],
    columns=['compound', 'neg', 'neu', 'pos'],
    dtype=float,
)

# Assign positionally (row i of the scores belongs to row i of df); this avoids
# one .loc write per cell and any dependence on the index labels of df.
df['sentiment_overall'] = vader_scores['compound'].to_numpy()
df['sentiment_neg'] = vader_scores['neg'].to_numpy()
df['sentiment_neu'] = vader_scores['neu'].to_numpy()
df['sentiment_pos'] = vader_scores['pos'].to_numpy()

In [11]:
# Tokenise every tweet, then preview the token lists of the first five tweets.
data = df['text'].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]
for idx in range(min(5, len(data_words))):
    print(data_words[idx])

['challenging', 'you', 'to', 'not', 'bite', 'your', 'phone', 'https', 'co', 'rcbzkwj']
['voila', 'this', 'secret', 'santa', 'is', 'bringing', 'schlotzskys', 'gift', 'cards', 'to', 'the', 'party', 'https', 'co', 'ke', 'hqp', 'gej']
['no', 'no', 'no', 'you', 'stay', 'where', 'you', 'are', 'well', 'come', 'to', 'you', 'free', 'delivery', 'on', 'saturday', 'and', 'sunday', 'https', 'co', 'lnazsp', 'wfn']
['tag', 'your', 'better', 'half', 'sharingiscaring', 'https', 'co', 'nvvreuic']
['no', 'holiday', 'spread', 'is', 'complete', 'without', 'cinnabon', 'add', 'to', 'your', 'catering', 'order', 'today', 'https', 'co', 'crbazf']


In [None]:
# "Corpus size" here is the total number of characters over all tokens
# (tokens are concatenated without separators).
# A single join is linear; growing the string with += inside nested index
# loops copies the accumulated text over and over.
length = ''.join(word for tweet_words in data_words for word in tweet_words)
print(f'Corpus size: {len(length)}')

In [12]:
# Train the phrase (collocation) models on the tokenised tweets.
# The trigram model is trained on the bigram-transformed corpus, so it can
# join an already-merged bigram with a following token.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Wrap the trained models in Phraser objects; process_words looks these two
# names up as globals, so this cell must run before process_words is called.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [18]:
# Run the full cleaning pipeline, then preview the first five processed tweets.
data_ready = process_words(data_words)  # processed Text Data!
for idx in range(min(5, len(data_ready))):
    print(data_ready[idx])

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.