# Twitter N-grams

#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from nltk import ngrams
import spacy
import collections

In [2]:
nlp = spacy.load('en_core_web_lg')

#### Reading twitter dataset

In [3]:
df_1 = pd.read_csv('../data/excel_files/combined_twitter_df.csv', index_col=0)
df_1.head()

Unnamed: 0,Tweet,Hashtag
0,@TheBuffaloNews Great to see the osteointegrat...,
1,After an @AANMember study documented the pay g...,ubuffalo
2,Join #UBGSE for Black History Nerds Saturday S...,ubgse blackhistorymonth ubuffalo
3,Michael Rembis is the director of the Center f...,
4,"Thank you, @NeurologyToday, for giving me the ...",paygaps genderinequity neurology ubuffalo wome...


In [4]:
df_1.isna().sum()

Tweet       0
Hashtag    48
dtype: int64

In [5]:
df_1['Tweet']

0      @TheBuffaloNews Great to see the osteointegrat...
1      After an @AANMember study documented the pay g...
2      Join #UBGSE for Black History Nerds Saturday S...
3      Michael Rembis is the director of the Center f...
4      Thank you, @NeurologyToday, for giving me the ...
                             ...                        
938    Get help from a professional team.\nWe guarant...
939    Get help from a professional team.\nWe guarant...
940    Securing you top grades is our top priority.\n...
941    Need help with your homework? DM us.\nGuarante...
942    A+ assured in your assignment(s).\nFor quality...
Name: Tweet, Length: 943, dtype: object

In [6]:
df_1['Hashtag']

0                                                    NaN
1                                               ubuffalo
2                       ubgse blackhistorymonth ubuffalo
3                                                    NaN
4      paygaps genderinequity neurology ubuffalo wome...
                             ...                        
938                             #universityatbuffalosuny
939                             #universityatbuffalosuny
940                             #universityatbuffalosuny
941                             #universityatbuffalosuny
942                             #universityatbuffalosuny
Name: Hashtag, Length: 943, dtype: object

#### Adding hashtags into default stopwords

In [15]:
hashtags = ['ubuffalo', 'ubtrueblue', 'ubhornsup', 'ubalumni', 'ubgse', 'ubbulls', 'ubmgt']

In [16]:
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

#### Reading all tweets dataset

In [17]:
df = pd.read_csv('../data/combined_files/all_tweets.csv', index_col=0)
df.head()

Unnamed: 0,Tweet
0,@TheBuffaloNews Great to see the osteointegrat...
1,After an @AANMember study documented the pay g...
2,Join #UBGSE for Black History Nerds Saturday S...
3,Michael Rembis is the director of the Center f...
4,"Thank you, @NeurologyToday, for giving me the ..."


In [18]:
ats = []
for data in df['Tweet']:
    for word in data.split(' '):
        if word.startswith('@'):
            ats.append(word.strip('@').lower())

In [19]:
custom_stopwords = ats+hashtags

In [20]:
nlp.Defaults.stop_words |= set(custom_stopwords)

In [21]:
def data_cleaner(row, part:list):
    words = row.split(' ')
    cleaned = [word.strip('@#"*%^();><?][{}]:.&,\'') for word in words if word.startswith(('\nhttps', 'htt'))==False]
    row = ' '.join(cleaned)

    
    doc = nlp(row)
    result = [(token.lemma_).lower() for token in doc if token.is_stop == False and token.is_punct==False and token.pos_ in part and token.is_digit==False and len(token.lemma_)>3]
    return result

In [22]:
df['Cleaned'] = df['Tweet'].apply(lambda x: ' '.join(data_cleaner(x, ['NOUN', 'PROPN', 'ADJ'])))
df.head()

Unnamed: 0,Tweet,Cleaned
0,@TheBuffaloNews Great to see the osteointegrat...,great osteointegrated program
1,After an @AANMember study documented the pay g...,study male female neurologist neurologist heal...
2,Join #UBGSE for Black History Nerds Saturday S...,black history nerds saturday school hill proje...
3,Michael Rembis is the director of the Center f...,michael rembis director center disability stud...
4,"Thank you, @NeurologyToday, for giving me the ...",opportunity paygaps genderinequity neurology w...


#### Generating N-Grams column in the dataframe

In [23]:
def generate_ngrams(df, n, text_column, column_name):
    def row_ngrams(text, n):
        tokenize = text.split()
        esgrams = ngrams(tokenize, n)
        ans = collections.Counter(esgrams)
        return ans
    df[column_name] = df[text_column].apply(lambda x: row_ngrams(x,n))

In [24]:
generate_ngrams(df, 2, 'Cleaned', 'Bigrams')
generate_ngrams(df, 3, 'Cleaned', 'Trigrams')
generate_ngrams(df, 4, 'Cleaned', 'Quadgrams')

In [25]:
df.head()

Unnamed: 0,Tweet,Cleaned,Bigrams,Trigrams,Quadgrams
0,@TheBuffaloNews Great to see the osteointegrat...,great osteointegrated program,"{('great', 'osteointegrated'): 1, ('osteointeg...","{('great', 'osteointegrated', 'program'): 1}",{}
1,After an @AANMember study documented the pay g...,study male female neurologist neurologist heal...,"{('study', 'male'): 1, ('male', 'female'): 1, ...","{('study', 'male', 'female'): 1, ('male', 'fem...","{('study', 'male', 'female', 'neurologist'): 1..."
2,Join #UBGSE for Black History Nerds Saturday S...,black history nerds saturday school hill proje...,"{('black', 'history'): 1, ('history', 'nerds')...","{('black', 'history', 'nerds'): 1, ('history',...","{('black', 'history', 'nerds', 'saturday'): 1,..."
3,Michael Rembis is the director of the Center f...,michael rembis director center disability stud...,"{('michael', 'rembis'): 1, ('rembis', 'directo...","{('michael', 'rembis', 'director'): 1, ('rembi...","{('michael', 'rembis', 'director', 'center'): ..."
4,"Thank you, @NeurologyToday, for giving me the ...",opportunity paygaps genderinequity neurology w...,"{('opportunity', 'paygaps'): 1, ('paygaps', 'g...","{('opportunity', 'paygaps', 'genderinequity'):...","{('opportunity', 'paygaps', 'genderinequity', ..."


#### Generating corpus for ngrams

In [26]:
def generate_corpus(df, column):
    corpus = dict()

    for row in df[column]:
        for key in row.keys():
            if key in corpus.keys():
                corpus[key] += 1
            else:
                corpus[key] = 1
    return corpus

In [27]:
bigram_corpus = generate_corpus(df, 'Bigrams')
trigram_corpus = generate_corpus(df, 'Trigrams')
quadgram_corpus = generate_corpus(df, 'Quadgrams')

#### Displaying top ngrams

In [28]:
def get_top_n(d, n):
    def keyfunction(k):
        return d[k]

    for key in sorted(d, key=keyfunction, reverse=True)[:n]:
        print ("%s: %i" % (key, d[key]))

#### Bi-Grams

In [32]:
get_top_n(bigram_corpus, 20)

('nomore', 'breakthesilence'): 78
('human', 'right'): 72
('discussion', 'tigray'): 69
('right', 'abuse'): 68
('black', 'history'): 67
('force', 'beginning'): 67
('tigray', 'human'): 66
('abuse', 'crime'): 66
('crime', 'rampant'): 66
('westerntigray', 'hand'): 66
('hand', 'ethiopian'): 66
('eritrean', 'force'): 66
('rampant', 'westerntigray'): 65
('ethiopian', 'eritrean'): 65
('beginning', 'tigraygenocide'): 62
('fighter', 'tplf'): 51
('university', 'buffalo'): 44
('alumni', 'arena'): 43
('school', 'management'): 43
('good', 'luck'): 41


#### Tri-Grams

In [33]:
get_top_n(trigram_corpus, 20)

('human', 'right', 'abuse'): 68
('discussion', 'tigray', 'human'): 66
('tigray', 'human', 'right'): 66
('right', 'abuse', 'crime'): 66
('abuse', 'crime', 'rampant'): 65
('crime', 'rampant', 'westerntigray'): 65
('rampant', 'westerntigray', 'hand'): 65
('westerntigray', 'hand', 'ethiopian'): 65
('ethiopian', 'eritrean', 'force'): 65
('eritrean', 'force', 'beginning'): 65
('hand', 'ethiopian', 'eritrean'): 64
('force', 'beginning', 'tigraygenocide'): 62
('black', 'history', 'nerds'): 33
('history', 'nerds', 'saturday'): 33
('nerds', 'saturday', 'school'): 33
('family', 'fighter', 'tplf'): 33
('graduate', 'school', 'education'): 21
('diversity', 'equity', 'inclusion'): 19
('civilian', 'percent', 'tigray'): 19
('tplf', 'terrorist', 'group'): 19


#### Quad-Grams

In [34]:
get_top_n(quadgram_corpus, 20)

('discussion', 'tigray', 'human', 'right'): 66
('tigray', 'human', 'right', 'abuse'): 66
('human', 'right', 'abuse', 'crime'): 66
('right', 'abuse', 'crime', 'rampant'): 65
('crime', 'rampant', 'westerntigray', 'hand'): 65
('abuse', 'crime', 'rampant', 'westerntigray'): 64
('rampant', 'westerntigray', 'hand', 'ethiopian'): 64
('hand', 'ethiopian', 'eritrean', 'force'): 64
('ethiopian', 'eritrean', 'force', 'beginning'): 64
('westerntigray', 'hand', 'ethiopian', 'eritrean'): 63
('eritrean', 'force', 'beginning', 'tigraygenocide'): 60
('black', 'history', 'nerds', 'saturday'): 33
('history', 'nerds', 'saturday', 'school'): 33
('center', 'black', 'history', 'racial'): 19
('black', 'history', 'racial', 'literacy'): 19
('history', 'racial', 'literacy', 'education'): 19
('tplf', 'terror', 'civilian', 'percent'): 18
('terror', 'civilian', 'percent', 'tigray'): 18
('ethiopia', 'federal', 'govt', 'nomore'): 18
('civilian', 'small', 'available', 'family'): 18
