In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer

### Read in data created from Data Retrieval

In [86]:
# Load the per-subreddit post dumps written by the data-retrieval step.
futurology_csv = '../datasets/futurology_2023-10-17_with_links.csv'
scifi_csv = '../datasets/scifi_2023-10-17_with_links.csv'

futurology = pd.read_csv(futurology_csv)
scifi = pd.read_csv(scifi_csv)

In [87]:
# Index the posts by their Reddit id. Guarded so that re-running this cell is
# a no-op rather than a KeyError (after the first run 'id' is no longer a column).
if futurology.index.name != 'id':
    futurology = futurology.set_index('id')

In [88]:
# Index the posts by their Reddit id. Guarded so that re-running this cell is
# a no-op rather than a KeyError (after the first run 'id' is no longer a column).
if scifi.index.name != 'id':
    scifi = scifi.set_index('id')

In [89]:
# Stack both subreddits' posts into a single frame; each row keeps its own index label.
subreddit_frames = [futurology, scifi]
reddit_data_df = pd.concat(subreddit_frames)

In [90]:
# Preview the first rows of the combined frame.
reddit_data_df.head()

Unnamed: 0_level_0,type,title,self_text,subreddit,upvote_ratio,link_address,user,datetime
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
15wi75l,hot,r/futurology is now in the fediverse at - http...,https://futurology.today\n\nWe’ve had a Discor...,Futurology,0.9,https://www.reddit.com/r/Futurology/comments/1...,FuturologyModTeam,2023-08-20 14:00:44
17a1cd6,hot,SpaceX says the US Federal Aviation Authority'...,,Futurology,0.89,https://arstechnica.com/space/2023/10/citing-s...,lughnasadh,2023-10-17 11:44:21
179wk9u,hot,Over 1 million satellites could be headed to E...,,Futurology,0.93,https://www.space.com/million-satellites-conge...,LiveScience_,2023-10-17 07:51:27
179pcbt,hot,Marc Andreessen just dropped a ‘Techno-Optimis...,,Futurology,0.76,https://fortune.com/2023/10/16/marc-andreessen...,JamesTiberiusKirque,2023-10-16 23:47:22
17a1mue,hot,Will human languages be more unified or divers...,"Direction A: For example, designing a universa...",Futurology,0.84,https://www.reddit.com/r/Futurology/comments/1...,Ghenym,2023-10-17 11:57:22


In [91]:
# (rows, columns) of the combined frame.
reddit_data_df.shape

(3618, 8)

In [92]:
# Count missing values per column.
missing_mask = reddit_data_df.isna()
missing_mask.sum()

type               0
title              0
self_text       2785
subreddit          0
upvote_ratio       0
link_address       0
user             151
datetime           0
dtype: int64

In [93]:
def URL_tokenize(url):
    """Turn a URL into a space-separated string of the words after its domain.

    The scheme ('http:' / 'https:') is removed, '&amp' / '&nbsp' entities are
    decoded (with or without the trailing ';'), and '?', '/', '-' and '_' are
    treated as word separators. The first token (the domain) is dropped.

    Parameters
    ----------
    url : str
        The link to tokenize.

    Returns
    -------
    str
        The path/query words joined by single spaces; '' when there is nothing
        after the domain or when `url` is not a string (e.g. NaN).
    """
    if not isinstance(url, str):
        return ''

    text = url.replace('https:', '').replace('http:', '')

    # Decode the ';'-terminated form first so no stray ';' is left behind.
    text = text.replace('&amp;', '&').replace('&amp', '&')
    text = text.replace('&nbsp;', ' ').replace('&nbsp', ' ')

    text = text.replace('?', ' ')
    text = text.replace('//', ' ')
    text = text.replace('/', ' ')

    # Drop the domain BEFORE splitting on '-' and '_': otherwise a domain such
    # as 'my-site.com' is broken in two and only its first half is discarded.
    remainder = ' '.join(text.split()[1:])
    remainder = remainder.replace('-', ' ').replace('_', ' ')

    # Re-split to collapse any runs of whitespace created above.
    return ' '.join(remainder.split())

In [94]:
def extract_domain(url):
    """Return the host part of a URL without its scheme or a leading 'www.'.

    Parameters
    ----------
    url : str
        The link to inspect.

    Returns
    -------
    str
        The first token of the URL once the scheme is removed and '/' and '?'
        are treated as separators, with a leading 'www.' stripped. Returns ''
        when `url` is not a string or contains no host token.
    """
    if not isinstance(url, str):
        return ''

    text = url.replace('https:', '').replace('http:', '')
    # '?' is a separator too, so 'example.com?x=1' yields 'example.com'.
    text = text.replace('/', ' ').replace('?', ' ')

    tokens = text.split()
    if not tokens:
        # Empty or scheme-only input: nothing to index into.
        return ''

    domain = tokens[0]
    # Strip 'www.' only as a prefix; a blanket replace would mangle hosts that
    # merely contain it (e.g. 'awww.com' -> 'acom').
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain

In [95]:
# Sanity check: scheme, 'www.' and path are stripped, leaving the bare domain.
print(extract_domain('http://www.google.com/hello'))

google.com


In [96]:
# Sanity check: the domain is dropped and the path is split into words.
# (The helper was previously named make_URLs_usable; that name is not defined
# in this notebook, so the call failed on a fresh kernel.)
print(URL_tokenize('http://www.google.com/hello-world/good&ampstuff'))

hello world good&stuff


In [97]:
# Words found after the domain of each post's link.
link_addresses = reddit_data_df['link_address']
reddit_data_df['url_content'] = link_addresses.apply(URL_tokenize)

In [98]:
# Bare domain of each post's link.
link_addresses = reddit_data_df['link_address']
reddit_data_df['url_domain'] = link_addresses.apply(extract_domain)

In [99]:
# Confirm the two derived URL columns contain no missing values.
url_columns = ['url_domain', 'url_content']
reddit_data_df.loc[:, url_columns].isna().sum()

url_domain     0
url_content    0
dtype: int64

In [100]:
# Features: the post title plus the two URL-derived columns. Target: the subreddit.
feature_columns = ['title', 'url_domain', 'url_content']
X = reddit_data_df.loc[:, feature_columns]
y = reddit_data_df.loc[:, 'subreddit']

In [101]:
# Preview the feature frame.
X.head()

Unnamed: 0_level_0,title,url_domain,url_content
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15wi75l,r/futurology is now in the fediverse at - http...,reddit.com,r Futurology comments 15wi75l rfuturology is n...
17a1cd6,SpaceX says the US Federal Aviation Authority'...,arstechnica.com,space 2023 10 citing slow starship reviews spa...
179wk9u,Over 1 million satellites could be headed to E...,space.com,million satellites congest low earth orbit stu...
179pcbt,Marc Andreessen just dropped a ‘Techno-Optimis...,fortune.com,2023 10 16 marc andreessen techno optimist man...
17a1mue,Will human languages be more unified or divers...,reddit.com,r Futurology comments 17a1mue will human langu...


In [104]:
# Shared NLTK instances used by the cells below:
# `wn` is the WordNet lemmatizer, `ps` the Porter stemmer.
wn = WordNetLemmatizer()
ps = PorterStemmer()

In [102]:
## Custom function (from Tim) that maps an NLTK POS tag onto a WordNet POS for lemmatizing
def custom_lemmatize(word, tag):
    """Lemmatize `word` using the WordNet part of speech implied by `tag`.

    Only the first character of `tag` is inspected: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. For any other leading character
    the word is returned unchanged.

    Relies on the module-level `wn` lemmatizer instance.
    """
    wordnet_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }.get(tag[0])

    # No WordNet equivalent for this tag: leave the token as it is.
    if not wordnet_pos:
        return word
    return wn.lemmatize(word, wordnet_pos)

### Stem

In [110]:
# reddit_data_df['title'].apply(lambda title: [ps.stem(word) for word in title.split()])

In [107]:
# Quick check of the Porter stemmer on a single word.
ps.stem('dropped')

'drop'

### Lemmatize

In [114]:
# Lemmatize every title: split on whitespace, POS-tag the tokens, lemmatize each
# token with its tag, then re-join with single spaces. The result is only
# displayed here, not stored back on the frame.
reddit_data_df['title'].apply(
    lambda title: ' '.join(
        custom_lemmatize(token, pos_tag)
        for token, pos_tag in nltk.pos_tag(title.split())
    )
)

id
15wi75l    r/futurology be now in the fediverse at - http...
17a1cd6    SpaceX say the US Federal Aviation Authority's...
179wk9u    Over 1 million satellite could be head to Eart...
179pcbt    Marc Andreessen just drop a ‘Techno-Optimist M...
17a1mue    Will human language be more unified or diverse...
                                 ...                        
erjs4l     Why Hulu Picked Up Seth MacFarlane's The Orvil...
3scmoj               A beautiful story about someone we love
gfu65e     Tom Cruise will work with NASA on first movie ...
1lf3oo     Ever wonder what Picard's crew would look like...
3z8zll      Boba Fett Actor Jason Wingreen Dies : People.com
Name: title, Length: 3618, dtype: object