Reading with pandas

In [1]:
import pandas as pd
data = pd.read_csv("file_name.csv")

In [2]:
# word tokenize each column of interest from the df
import nltk
from nltk import word_tokenize, sent_tokenize

In [3]:
data['Date Created'] = pd.to_datetime(data['Date Created'],unit='s')

In [75]:
data.columns

Index(['Title', 'Political Lean', 'Score', 'Id', 'Subreddit', 'URL',
       'Num of Comments', 'Text', 'Date Created'],
      dtype='object')

In [5]:
data.groupby(['Political Lean']).size()

Political Lean
Conservative    4535
Liberal         8319
dtype: int64

In [6]:
data.groupby(['Subreddit']).size()

Subreddit
Capitalism              975
Communist               574
DemocraticSocialism     922
Liberal                 904
Libertarian             975
RadicalFeminism         100
SocialDemocracy         997
alltheleft              997
anarchocapitalism       637
conservatives          1000
democrats               941
feminisms               935
progressive             974
republicans             948
socialism               975
dtype: int64

In [7]:
liberal_df = data[data['Political Lean'] == 'Liberal']
liberal_df.groupby('Subreddit').size()
#print(len(liberal_data))

Subreddit
Communist              574
DemocraticSocialism    922
Liberal                904
RadicalFeminism        100
SocialDemocracy        997
alltheleft             997
democrats              941
feminisms              935
progressive            974
socialism              975
dtype: int64

In [8]:
conservative_df = data[data['Political Lean'] == 'Conservative']
conservative_df.groupby('Subreddit').size()

Subreddit
Capitalism            975
Libertarian           975
anarchocapitalism     637
conservatives        1000
republicans           948
dtype: int64

In [9]:
# word tokenize liberal text

from nltk import word_tokenize, sent_tokenize
liberal_data = data[data['Political Lean'] == 'Liberal']
test = liberal_data['Text']
test = test.dropna()
test

2       Who watched the state of the union last night ...
11      I have fallen for this trap several times and ...
20      One of the things I have noticed in todays wor...
42      [https://kites-journal.org/2022/03/01/between-...
54      ***"Axe tax"*** aka ***"Hammer tax"*** aka ***...
                              ...                        
8159    http://www.facebook.com/event.php?eid=27851760...
8187                                     just wonderin :p
8213    Ok to start off I'm not communist, I'm liberta...
8250    Please forward\n\n \n\nCCR EXPLAINS HLP v. HOL...
8280    I think a little materialist analysis on the c...
Name: Text, Length: 1471, dtype: object

In [10]:
t_lib = " "
for text in test:     # convert from column texts to string
    t_lib+=str(text)

In [11]:
t_lib = t_lib.lower()

In [12]:
import re
t_lib = re.sub(r'\d+', '', t_lib)

In [13]:
import string
t_lib = t_lib.translate(str.maketrans("","", string.punctuation))

In [14]:
tok = word_tokenize(t_lib)

In [15]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# add more words to stopwords
my_words=['th','would', 'https', '’', '”', '“', '’']
stop_words.extend(my_words)
tok = word_tokenize(t_lib)
result = [i for i in tok if not i in stop_words]

In [76]:
from nltk.probability import FreqDist
fdist = FreqDist(result)
fdist = fdist.most_common(10)
fdist

[('people', 1154),
 ('like', 965),
 ('one', 706),
 ('us', 664),
 ('social', 624),
 ('party', 594),
 ('workers', 566),
 ('also', 555),
 ('think', 507),
 ('even', 484)]

In [21]:
from nltk.probability import ConditionalFreqDist
cfdist = ConditionalFreqDist()

for word in word_tokenize(t_lib): 
    condition = len(word)
    cfdist[condition][word] += 1
cfdist[5]

FreqDist({'would': 1060, 'their': 1047, 'about': 1005, 'which': 741, 'there': 680, 'other': 600, 'party': 594, 'these': 516, 'think': 507, 'being': 434, ...})

In [22]:
from textblob import TextBlob
tb = TextBlob(t_lib)
print(tb.sentiment.polarity)

0.0890375085745266


In [23]:
from nltk.probability import ConditionalFreqDist
from nltk.tokenize import word_tokenize
cfdist = ConditionalFreqDist()

for word in result: 
    condition = len(word)
    cfdist[condition][word] += 1
cfdist[2]

FreqDist({'us': 664, 'im': 372, 'go': 195, 'xb': 184, 'id': 95, 'eu': 62, 'uk': 50, '•b': 41, 'ie': 40, 'na': 38, ...})

In [77]:
bgs = nltk.bigrams(result)
fdist_bgs = nltk.FreqDist(bgs)
fdist_bgs.most_common(10)

[(('working', 'class'), 161),
 (('social', 'democracy'), 132),
 (('social', 'democratic'), 86),
 (('united', 'states'), 86),
 (('social', 'democrats'), 70),
 (('soviet', 'union'), 62),
 (('democratic', 'party'), 50),
 (('means', 'production'), 48),
 (('feel', 'like'), 48),
 (('many', 'people'), 47)]

In [26]:
# parts of speech
tagged = nltk.pos_tag(result)

In [27]:
# distribution of parts of speech
fdis = FreqDist(tagged)
fdis = fdis.most_common(15)
fdis

[(('people', 'NNS'), 1154),
 (('like', 'IN'), 949),
 (('one', 'CD'), 706),
 (('us', 'PRP'), 664),
 (('social', 'JJ'), 624),
 (('party', 'NN'), 594),
 (('workers', 'NNS'), 566),
 (('also', 'RB'), 555),
 (('even', 'RB'), 483),
 (('government', 'NN'), 451),
 (('time', 'NN'), 448),
 (('think', 'VBP'), 423),
 (('state', 'NN'), 419),
 (('many', 'JJ'), 415),
 (('new', 'JJ'), 397)]

In [52]:
trig = nltk.trigrams(result)
fdistr = nltk.FreqDist(trig)
fdistr.most_common(15)

[(('tries', 'leave', 'russiancolonialism'), 25),
 (('free', 'discuss', 'whatever'), 20),
 (('discuss', 'whatever', 'please'), 20),
 (('whatever', 'please', 'socdem'), 20),
 (('please', 'socdem', 'related'), 20),
 (('socdem', 'related', 'entirely'), 20),
 (('related', 'entirely', 'unrelated'), 20),
 (('entirely', 'unrelated', 'whatever'), 20),
 (('unrelated', 'whatever', 'youd'), 20),
 (('leave', 'russiancolonialism', 'moscow'), 15),
 (('social', 'democratic', 'party'), 15),
 (('world', 'war', 'ii'), 15),
 (('social', 'democratic', 'parties'), 11),
 (('ownership', 'means', 'production'), 11),
 (('new', 'york', 'times'), 10)]

In [68]:
tag_fd = nltk.FreqDist(tag for (word, tag) in tagged)
tag_fd.most_common(20)

[('NN', 52832),
 ('JJ', 32962),
 ('NNS', 21159),
 ('RB', 10640),
 ('VBP', 9780),
 ('VBG', 7139),
 ('VBD', 6844),
 ('VBN', 3879),
 ('IN', 3171),
 ('VBZ', 3025),
 ('VB', 3021),
 ('CD', 1210),
 ('MD', 909),
 ('JJR', 695),
 ('PRP', 683),
 ('NNP', 509),
 ('JJS', 451),
 ('DT', 435),
 ('RBR', 400),
 ('FW', 218)]

In [88]:
# liberals mentioning specific words

fdist_w = FreqDist(tok)
vocab = fdist_w.keys()
fdist_w['biden']

58

END OF LIB PREPROCESSING

In [None]:
test3 = data.dropna()

In [55]:
# word tokenize conservative text
conservative_data = data[data['Political Lean'] == 'Conservative']
test = conservative_data['Text']
test3 = test.dropna()
test3

8328     Socialism  is a redistribution system not a we...
8344     1) After informing Derek about his sore stomac...
8345     What are folks thinking.\n\nI was *impressed* ...
8357                                               Curious
8359                             Holy shit. We’re screwed.
                               ...                        
12829    "Well now, which is the longest river in Afric...
12837    Let's say there are private courts/resolution ...
12838    Basically, the above. It can be a very effecti...
12842    Last week I would've considered myself a liber...
12853    I go to the mises.org and listen to the writin...
Name: Text, Length: 957, dtype: object

In [None]:
# method n2 to remove punctuation
#tok2 = tok
#tok = list(filter(lambda tok: tok not in string.punctuation, tok))
#tok2

In [33]:
t_conserv = " "
for text in test3:     # convert from column texts to string
    t_conserv+=str(text)

In [34]:
t_conserv = t_conserv.lower()

In [35]:
import re
t_conserv = re.sub(r'\d+', '', t_conserv)

In [36]:
import string
t_conserv = t_conserv.translate(str.maketrans("","", string.punctuation))

In [44]:
tokc = word_tokenize(t_conserv)

In [47]:
from nltk.corpus import stopwords
s_words = stopwords.words('english')

my_words=['th','would', 'https', '’', '”', '“', "’", '’']
s_words.extend(my_words)

tokc = word_tokenize(t_conserv)
result1 = [c for c in tokc if not c in s_words]

In [84]:
fdist1 = FreqDist(result1)
fdist1 = fdist1.most_common(10)
fdist1

[('people', 708),
 ('like', 415),
 ('government', 359),
 ('one', 320),
 ('us', 290),
 ('think', 278),
 ('get', 270),
 ('dont', 244),
 ('even', 241),
 ('capitalism', 236)]

In [85]:
bgs1 = nltk.bigrams(result1)
fdist_bgs1 = nltk.FreqDist(bgs1)
fdist_bgs1.most_common(10)

[(('free', 'market'), 56),
 (('minimum', 'wage'), 47),
 (('united', 'states'), 31),
 (('things', 'like'), 27),
 (('many', 'people'), 27),
 (('dont', 'know'), 24),
 (('seems', 'like'), 21),
 (('private', 'property'), 21),
 (('dont', 'want'), 20),
 (('middle', 'class'), 18)]

In [87]:
# conservatives mentioning capitalism 

fdist = FreqDist(tokc)
vocab = fdist.keys()
fdist['biden']

41

In [51]:
from textblob import TextBlob
tb = TextBlob(t_conserv)
print(tb.sentiment.polarity)

0.08423891682332069


looks like you processed it well
if I was trying to get more context of what they are saying
I'd look at bigrams and trygram fredist
FreqDist
(trigram) can't type today
you could take what you have an also make a text classifier with a few modifications


In [58]:
# tagged parts of speech
taggedc = nltk.pos_tag(result1)

In [59]:
fdisc = FreqDist(taggedc)
fdisc = fdisc.most_common(12)
fdisc

[(('people', 'NNS'), 708),
 (('like', 'IN'), 410),
 (('government', 'NN'), 359),
 (('one', 'CD'), 318),
 (('us', 'PRP'), 290),
 (('even', 'RB'), 240),
 (('capitalism', 'NN'), 227),
 (('money', 'NN'), 224),
 (('think', 'VBP'), 221),
 (('could', 'MD'), 215),
 (('also', 'RB'), 207),
 (('state', 'NN'), 203)]

In [60]:
# bigram

bgsc = nltk.bigrams(result1)
fdist_bgsc = nltk.FreqDist(bgsc)
fdist_bgsc.most_common(15)

[(('free', 'market'), 56),
 (('minimum', 'wage'), 47),
 (('united', 'states'), 31),
 (('things', 'like'), 27),
 (('many', 'people'), 27),
 (('dont', 'know'), 24),
 (('seems', 'like'), 21),
 (('private', 'property'), 21),
 (('dont', 'want'), 20),
 (('middle', 'class'), 18),
 (('get', 'rid'), 17),
 (('people', 'dont'), 17),
 (('goods', 'services'), 17),
 (('dont', 'think'), 16),
 (('lets', 'say'), 16)]

In [86]:
# trigram

trigc = nltk.trigrams(tokc)
fdist_trigc = nltk.FreqDist(trigc)
fdist_trigc.most_common(10)

[(('it', '’', 's'), 93),
 (('a', 'lot', 'of'), 70),
 (('i', '’', 'm'), 63),
 (('don', '’', 't'), 58),
 (('be', 'able', 'to'), 36),
 (('one', 'of', 'the'), 31),
 (('to', 'be', 'a'), 29),
 (('in', 'order', 'to'), 29),
 (('the', 'united', 'states'), 27),
 (('there', 'is', 'no'), 26)]

In [69]:
tag_fd1 = nltk.FreqDist(tag for (word, tag) in taggedc)
tag_fd1.most_common(20)

[('NN', 23615),
 ('JJ', 13781),
 ('NNS', 9004),
 ('VBP', 4629),
 ('RB', 4585),
 ('VBG', 3142),
 ('VBD', 2643),
 ('VBN', 1618),
 ('VB', 1528),
 ('VBZ', 1497),
 ('IN', 1380),
 ('CD', 603),
 ('MD', 461),
 ('JJR', 335),
 ('PRP', 301),
 ('DT', 252),
 ('JJS', 246),
 ('NNP', 235),
 ('RBR', 194),
 ('FW', 105)]