# Reddit N-grams

#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from nltk import ngrams
import spacy
import collections

In [2]:
nlp = spacy.load('en_core_web_lg')

#### Reading reddit dataset

In [3]:
df = pd.read_csv('../data/all/reddit_data_all.csv', index_col=0)
df.drop(0, axis=0, inplace=True)
df.head()

Unnamed: 0,id,title,score,comms_num,comments,search_topic,body,date
1,imkf0n,UBIT is safe,518,7,"['This made my day, thank you', 'COVID-19: *”g...",UBreddit,,2020-09-04 17:34:30
2,ijhhd5,WEEK ONE!!! REMEMBER TO WEAR YOUR MASKS AND PO...,472,16,['It’s not a new school week until Mr Krabs sa...,UBreddit,,2020-08-30 18:17:02
3,f8019t,What the last 2 minutes of lecture looks like ...,472,21,"['my dude how long did this take lol', 'Is tha...",UBreddit,,2020-02-22 22:30:28
4,dzmrbc,The SU Bull today,444,45,['FYI: Do not malign Chinese students holding ...,UBreddit,,2019-11-21 17:35:56
5,ex5bsz,Logging into MyUB be like.,442,27,"[""Since this semester started, it's been havin...",UBreddit,,2020-02-01 12:15:53


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316 entries, 1 to 412
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            316 non-null    object
 1   title         316 non-null    object
 2   score         316 non-null    int64 
 3   comms_num     316 non-null    int64 
 4   comments      316 non-null    object
 5   search_topic  316 non-null    object
 6   body          138 non-null    object
 7   date          316 non-null    object
dtypes: int64(2), object(6)
memory usage: 22.2+ KB


#### Checking for reddit posting year from date column

In [5]:
years={}
for date in df['date']:
    year = date.split(' ')[0].split('-')[0]
    month = date.split(' ')[0].split('-')[1]
    day = date.split(' ')[0].split('-')[2]
    if year in years.keys():
        years[year]=years[year]+1
    else:
        years[year]=1

In [6]:
years

{'2020': 74,
 '2019': 26,
 '2022': 126,
 '2021': 48,
 '2018': 22,
 '2017': 5,
 '2014': 5,
 '2016': 2,
 '2015': 5,
 '2012': 1,
 '2013': 2}

#### Combining comments, titles and body into one dataframe

In [7]:
df2 = df[['title','comments','body']]

In [8]:
df2.isna().sum()

title         0
comments      0
body        178
dtype: int64

In [9]:
df2['comments'][1]

"['This made my day, thank you', 'COVID-19: *”goddamn it I just verified an hour ago”*', '*Set to remind me in seven days.*', 'Alright this is some actual hilarious content, thanks.', 'Really don’t think it works that way... but thanks for the chuckle.']"

In [10]:
rows = [row.split("\', \'") for row in df2.comments]

In [11]:
comments = pd.Series([line for row in rows for line in row if line != '[]'])
len(comments)

1521

In [12]:
body = df.body.dropna()
len(body)

138

In [13]:
titles = pd.Series(df.title)

In [14]:
titles

1                                           UBIT is safe
2      WEEK ONE!!! REMEMBER TO WEAR YOUR MASKS AND PO...
3      What the last 2 minutes of lecture looks like ...
4                                      The SU Bull today
5                             Logging into MyUB be like.
                             ...                        
408    University at Buffalo | Niagara University | M...
409                             Class with Zoe Hamstead?
410                                 lets revive this sub
411                        University At Buffalo! (SUNY)
412                                               HELLO!
Name: title, Length: 316, dtype: object

In [15]:
body

17     CCP trolls have infiltrated UBReddit and are d...
29     Hey everyone, I’m a grad student and a TA at U...
99         That was an absolute blowout! Let's go Bulls!
100    The current one is archived for the time being...
101    As we near the second half of the semester, it...
                             ...                        
407    Hi, I have Clyde Herreid for Bio 200 and Gersh...
409    Hey has anyone taken END302 Sustainable Urban ...
410    I know what you are all saying; this sub isn't...
411    Wohoo!! Lets go class of 2018!\n\nThis subredd...
412                        Looking forward to attend UB!
Name: body, Length: 138, dtype: object

In [16]:
comments

0                           ['This made my day, thank you
1       COVID-19: *”goddamn it I just verified an hour...
2                       *Set to remind me in seven days.*
3       Alright this is some actual hilarious content,...
4       Really don’t think it works that way... but th...
                              ...                        
1516                                                ['Sup
1517                                     Honestly a lot']
1518                                        ['[deleted]']
1519    ['Just hang out on /r/SUNYBuffalo instead.\n\n...
1520    ["That's because everyone is at /r/ubreddit\n\...
Length: 1521, dtype: object

In [17]:
combined = pd.concat([titles, body, comments])

In [18]:
combined_df = pd.DataFrame(combined)
combined_df = combined_df.rename(columns={0:'Text'})
combined_df.reset_index(drop=True, inplace=True)

In [19]:
combined_df

Unnamed: 0,Text
0,UBIT is safe
1,WEEK ONE!!! REMEMBER TO WEAR YOUR MASKS AND PO...
2,What the last 2 minutes of lecture looks like ...
3,The SU Bull today
4,Logging into MyUB be like.
...,...
1970,['Sup
1971,Honestly a lot']
1972,['[deleted]']
1973,['Just hang out on /r/SUNYBuffalo instead.\n\n...


#### Cleaning data for ngram generation

In [20]:
def data_cleaner(row, part):
    words = row.split(' ')
    cleaning = [word.strip('@#"*%^();><?][{}]:.&,\'') for word in words if word.startswith(('\nhttps', 'htt'))==False]
    row = ' '.join(cleaning)
    
    doc = nlp(row)
    output = [(token.lemma_).lower() for token in doc if token.is_stop == False and token.is_punct==False and token.is_space==False and token.pos_ in part and token.is_digit==False and len(token)>3]
    return output

In [21]:
combined_df['Cleaned'] = combined_df['Text'].apply(lambda x: ' '.join(data_cleaner(x, ['NOUN', 'PROPN', 'ADJ'])))
combined_df

Unnamed: 0,Text,Cleaned
0,UBIT is safe,ubit safe
1,WEEK ONE!!! REMEMBER TO WEAR YOUR MASKS AND PO...,week mask post speedruns daily health check week
2,What the last 2 minutes of lecture looks like ...,minute lecture professor
3,The SU Bull today,bull today
4,Logging into MyUB be like.,myub
...,...,...
1970,['Sup,
1971,Honestly a lot'],
1972,['[deleted]'],
1973,['Just hang out on /r/SUNYBuffalo instead.\n\n...,instead.\n\nor suny


#### Creating ngrams columns in the dataframe

In [22]:
def generate_ngrams(df, n, text_column, column_name):
    def row_ngrams(text, n):
        tokenize = text.split()
        esgrams = ngrams(tokenize, n)
        ans = collections.Counter(esgrams)
        return ans
    df[column_name] = df[text_column].apply(lambda x: row_ngrams(x,n))

In [23]:
generate_ngrams(combined_df, 2, 'Cleaned', 'Bigrams')
generate_ngrams(combined_df, 3, 'Cleaned', 'Trigrams')
generate_ngrams(combined_df, 4, 'Cleaned', 'Quadgrams')

In [24]:
combined_df

Unnamed: 0,Text,Cleaned,Bigrams,Trigrams,Quadgrams
0,UBIT is safe,ubit safe,"{('ubit', 'safe'): 1}",{},{}
1,WEEK ONE!!! REMEMBER TO WEAR YOUR MASKS AND PO...,week mask post speedruns daily health check week,"{('week', 'mask'): 1, ('mask', 'post'): 1, ('p...","{('week', 'mask', 'post'): 1, ('mask', 'post',...","{('week', 'mask', 'post', 'speedruns'): 1, ('m..."
2,What the last 2 minutes of lecture looks like ...,minute lecture professor,"{('minute', 'lecture'): 1, ('lecture', 'profes...","{('minute', 'lecture', 'professor'): 1}",{}
3,The SU Bull today,bull today,"{('bull', 'today'): 1}",{},{}
4,Logging into MyUB be like.,myub,{},{},{}
...,...,...,...,...,...
1970,['Sup,,{},{},{}
1971,Honestly a lot'],,{},{},{}
1972,['[deleted]'],,{},{},{}
1973,['Just hang out on /r/SUNYBuffalo instead.\n\n...,instead.\n\nor suny,"{('instead.\n\nor', 'suny'): 1}",{},{}


#### Generating corpus for top ngrams

In [25]:
def generate_corpus(df, column):
    corpus = dict()

    for row in df[column]:
        for key in row.keys():
            if key in corpus.keys():
                corpus[key] += 1
            else:
                corpus[key] = 1
    return corpus

In [26]:
bigram_corpus = generate_corpus(combined_df, 'Bigrams')
trigram_corpus = generate_corpus(combined_df, 'Trigrams')
quadgram_corpus = generate_corpus(combined_df, 'Quadgrams')

#### Displaying top ngrams

In [27]:
def get_top_n(d, n):
    def keyfunction(k):
        return d[k]

    for key in sorted(d, key=keyfunction, reverse=True)[:n]:
        print ("%s: %i" % (key, d[key]))
    

#### Bi-Grams

In [28]:
get_top_n(bigram_corpus, 10)

('north', 'campus'): 25
('south', 'campus'): 24
('rent', 'month'): 16
('spring', 'break'): 12
('good', 'luck'): 10
('university', 'buffalo'): 10
('grad', 'student'): 10
('living', 'room'): 10
('private', 'bathroom'): 10
('station', 'buffalo'): 10


#### Tri-Grams

In [29]:
get_top_n(trigram_corpus, 10)

('kitchen', 'living', 'room'): 8
('bedroom', 'private', 'bathroom'): 7
('rent', 'month', 'utility'): 6
('month', 'june', 'july'): 5
('lease', 'bedroom', 'private'): 5
('private', 'bathroom', 'laundry'): 5
('bathroom', 'laundry', 'unit'): 5
('laundry', 'unit', 'kitchen'): 5
('unit', 'kitchen', 'living'): 5
('living', 'room', 'station'): 5


#### Quad-Grams

In [30]:
get_top_n(quadgram_corpus, 10)

('lease', 'bedroom', 'private', 'bathroom'): 5
('bedroom', 'private', 'bathroom', 'laundry'): 5
('private', 'bathroom', 'laundry', 'unit'): 5
('bathroom', 'laundry', 'unit', 'kitchen'): 5
('laundry', 'unit', 'kitchen', 'living'): 5
('unit', 'kitchen', 'living', 'room'): 5
('kitchen', 'living', 'room', 'station'): 5
('living', 'room', 'station', 'buffalo'): 5
('room', 'station', 'buffalo', 'rent'): 5
('station', 'buffalo', 'rent', 'month'): 5
