# Data Collection

Posts are collected with the scraper in `reddit-scrape.py`.

In [None]:
def _scrape_category(category, subreddits):
    """Build a Reddit client with the shared credentials, call
    get_posts(category, subreddits) on it, and return the client."""
    client = Reddit(user_name, password, client_id, secret_key, user_agent)
    client.get_posts(category, subreddits)
    return client


# Tech Data
tech_reddit = _scrape_category(
    'Electronics/Gadgets',
    ['tech', 'webdev', 'techsupport', 'web_design', 'gadgets', 'learnprogramming'],
)
tech_df = tech_reddit.posts

# Sports Data
sports_reddit = _scrape_category(
    'Sports',
    ['sports', 'nba', 'soccer', 'nfl', 'baseball', 'hockey'],
)
sports_df = sports_reddit.posts

# Travel Data
travel_reddit = _scrape_category(
    'Travel',
    ['Shoestring', 'travel', 'wanderlust', 'solotravel'],
)
travel_df = travel_reddit.posts

# Art Data
art_reddit = _scrape_category(
    'Arts',
    ['ArtFundamentals', 'Art', 'Sketchpad', 'ArtStore', 'ArtTools'],
)
art_df = art_reddit.posts

# Music Data
# NOTE(review): 'Electronics' is listed under Music - confirm this subreddit is intended.
music_reddit = _scrape_category(
    'Music',
    ['Music', 'ListenToThis', 'DubStep', 'HipHopHeads', 'Guitar', 'Electronics', 'Vinyl'],
)
music_df = music_reddit.posts

In [None]:
# Stack the per-category frames into one table and save it for the cleaning step.
category_frames = [tech_df, sports_df, travel_df, art_df, music_df]
combined_data = pd.concat(category_frames)
# The index is written as well; it comes back as an 'Unnamed: 0' column on read.
combined_data.to_csv('datasets/reddit-categories1.csv')

## Data Cleaning

Backup cleaning functions

In [None]:
# Characters treated as punctuation by the cleaning helpers in this notebook.
punctuations = string.punctuation
# English stop-word list from NLTK (the NLTK 'stopwords' corpus must be available).
stopwords = nltk.corpus.stopwords.words('english')

def remove_punc(text):
    """Return `text` with every character found in `punctuations` dropped."""
    kept = filter(lambda ch: ch not in punctuations, text)
    return ''.join(kept)

def remove_stopwords(text):
    """Split `text` on whitespace and drop every token that is a stop word.

    Matching is exact (case-sensitive) against the module-level `stopwords`.

    Args:
        text: String to tokenize.

    Returns:
        List of the remaining tokens, in their original order.
    """
    # Build a set once per call so each membership test is O(1) instead of
    # scanning the whole stop-word list for every token.
    stopword_set = set(stopwords)
    return [word for word in text.split() if word not in stopword_set]

def convert_lowercase(text):
    """Return `text` converted to lowercase.

    Args:
        text: A string, or an iterable of strings.

    Returns:
        A single lowercase string. For a non-string iterable the lowercased
        items are concatenated with no separator.
    """
    if isinstance(text, str):
        # Lowercase the whole string in one call: faster than going character
        # by character, and it lets str.lower() apply context-sensitive rules
        # (e.g. Greek final sigma) that per-character lowering cannot see.
        return text.lower()
    return ''.join(part.lower() for part in text)

def preprocess(text):
    """Clean a raw string and return its remaining tokens as a list.

    Steps, in order: delete words of one or two word-characters, strip
    punctuation, lowercase, then split on whitespace and drop stop words.

    NOTE(review): punctuation is stripped before stop-word filtering, so any
    stop-word entries that contain an apostrophe can no longer match -
    confirm this ordering is intended.
    """
    without_short_words = re.sub(r'\b\w{1,2}\b', '', text)
    lowered = convert_lowercase(remove_punc(without_short_words))
    return remove_stopwords(lowered)

In [None]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline once at cell level; spacy_lemmatize reuses it on every call.
nlp = spacy.load('en_core_web_sm')

def spacy_lemmatize(text):
    """Lemmatize `text` with the module-level spaCy pipeline `nlp`.

    Args:
        text: Either a string, or a list/tuple of string tokens. A sequence
            of tokens is joined with single spaces before processing.

    Returns:
        List with the `lemma_` of every token spaCy produces, in document
        order.
    """
    # isinstance (rather than an exact type() comparison) also accepts list
    # subclasses and tuples, which would otherwise reach spaCy as their repr.
    if isinstance(text, (list, tuple)):
        text = ' '.join(text)
    doc = nlp(str(text))
    return [token.lemma_ for token in doc]

In [None]:
# Reload the scraped posts from disk.
reddit = pd.read_csv('datasets/reddit-categories1.csv')
# Drop the saved index column that read_csv surfaces as 'Unnamed: 0'.
reddit.drop('Unnamed: 0', axis=1, inplace=True)
# Missing values become empty strings so the concatenation below stays a string.
reddit.fillna('', inplace=True)
# Join title and body with a space; without it the last word of the title
# fuses with the first word of the body into a single bogus token.
reddit['full-text'] = reddit['title'] + ' ' + reddit['body']
# Clean and lemmatize every post, then drop tokens that are only punctuation
# and re-join the rest into one space-separated string per post.
lemmas = reddit['full-text'].map(preprocess).map(spacy_lemmatize)
reddit['clean-text'] = lemmas.apply(
    lambda tokens: ' '.join([word for word in tokens if word not in punctuations])
).map(str)
# Keep only the label and the cleaned text for modelling, and save a copy.
data = reddit[['category', 'clean-text']]
data.to_csv('datasets/train-test-data.csv')

## Data Modelling

In [None]:
X = data['clean-text']
y = data['category']

# Encode category names as integer labels; `ref` keeps each distinct
# (category name, integer label) pair for later lookup.
le = LabelEncoder()
y = le.fit_transform(y)
ref = set(zip(data['category'].to_numpy(), y))

print(X.shape, y.shape)

# Stratified 70/30 split. A fixed random_state makes the split, and therefore
# the reported accuracy, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Fit TF-IDF on the training split only, then reuse that vocabulary for the
# test split so no test information leaks into the features.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

nb = MultinomialNB()
nb.fit(X_train, y_train)
# Mean accuracy on the held-out split (shown as the cell output).
nb.score(X_test, y_test)

In [None]:
# Hand-picked sample texts for spot-checking the classifier.
tests = [
    'Such a peaceful retreat, close to the mountains and close to the ocean as well. The perfect combination',
    "Our home for the next 5 nights in Albania",
    # Misclassified: should lean towards arts, not music.
    "Very pretty little Pandora Clock, with the large Z566M Nixie tubes. Made out of an old Victorian sewing box, light sensor hidden in the keyhole to dim the tubes at night!",
    # Hashtags really help here.
    "It's an honor that I've been chosen as the top 10 in the Art competition.#arts #art #artaccount #artist  #paint I I'm grateful for this opportunity. I honestly didn't expect to be selected but I'm glad I joined this. There's still more to go",
    "No one would've batted an eye if Odell Beckham started camp on the PUP list. That's the normal time table.Odell beat it.",
    'Reverse engineering generative models by Facebook AI can identify deepfakes and track their origin! Cool, eh? We need that for all sorts of deepfakes, especially audio',
    '"WONDER" By me :)Yet another song I made, hope you guys like it!(Soundcloud link in thread)',
    'meet me where the #music meets the #sea....',
    # Without the hashtag this one is classified as art.
    'A winning sunset  #vacation',
    'Happy #HumpDay from the beach Umbrella on ground Wednesday night moon',
    # Same text as the first entry.
    'Such a peaceful retreat, close to the mountains and close to the ocean as well. The perfect combination',
    'NEW VIDEO - The iPhone 13 Models!',
    'Tesla Model S PLAID Impressions: Re-Inventing the Wheel!',
]

In [None]:
# Sample texts that are joined into a single document before prediction.
tests_full = [
    'Such a peaceful retreat, close to the mountains and close to the ocean as well. The perfect combination',
    "Our home for the next 5 nights in Albania",
    "No one would've batted an eye if Odell Beckham started camp on the PUP list. That's the normal time table.Odell beat it.",
    'Happy #HumpDay from the beach Umbrella on ground Wednesday night moon',
    # Same text as the first entry.
    'Such a peaceful retreat, close to the mountains and close to the ocean as well. The perfect combination',
    'NEW VIDEO - The iPhone 13 Models!',
    'Tesla Model S PLAID Impressions: Re-Inventing the Wheel!',
]

In [None]:
# Merge all sample texts into one document, run it through preprocess and
# spacy_lemmatize, and join the lemmas back into a single string.
# NOTE(review): unlike the training text, tokens that are only punctuation
# are not filtered out after lemmatization here - confirm this is intended.
combined_text = ' '.join(tests_full)
user_input = ' '.join(spacy_lemmatize(preprocess(combined_text)))

In [None]:
# Vectorize the cleaned text with the already-fitted TF-IDF vectorizer.
# This rebinds user_input from a string to the transformed matrix, so this
# cell should not be re-run without first re-running the cell that builds the string.
user_input = tfidf.transform([user_input])

In [None]:
# Per-class probabilities for the combined input; columns follow nb.classes_,
# i.e. the integer labels produced by the LabelEncoder `le`.
nb.predict_proba(user_input)