In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import nltk
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import os
import pandas as pd
import re
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime

In [2]:
english_stopwords = set(stopwords.words('english') + list(punctuation) + ['..','...','nbsp','n\'t'])

In [40]:
lemmatizer = WordNetLemmatizer()

In [22]:
df = pd.read_csv('blogtext.csv')

In [53]:
#Gathering blogs by all genders between 20-29
new_df = df[(df['age'] > 19) & (df['age'] < 30)]

In [54]:
sorted_df = new_df.groupby('id').agg({
    'id': 'first',
    'gender': 'first',
    'age': 'first',
    'text': '\n'.join
})

In [55]:
sorted_df

Unnamed: 0_level_0,id,gender,age,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5114,5114,male,25,Slashdot raises lots of urlLink ...
7596,7596,male,26,Every day should be a half day. Took t...
8349,8349,male,24,"Ever wondered what a urlLink Bad head,..."
9289,9289,male,23,testing \n wi...
9470,9470,male,25,Yet another TV Channel. This time it's ...
...,...,...,...,...
4335239,4335239,female,27,Have you been watching the Olympics? I...
4335684,4335684,male,27,"Ok, while driving fast up I-88, I w..."
4336267,4336267,male,27,Will discuss Spy Fly which is being dev...
4336547,4336547,male,23,"Well, my first post. I guess it would ..."


In [56]:
def manipulate_text(text):
    text = text.lower()
    toke_text = word_tokenize(text)
    wordlist = []
    for word in toke_text:
        if word not in english_stopwords:
            wordlist.append(word)
    wordlist2 = []
    for eachword in wordlist:
        wordlist2.append(lemmatizer.lemmatize(eachword))
    text = [' '.join(wordlist2)]
    text = str(text)
    return text

In [57]:
sorted_df['text'] = sorted_df['text'].apply(manipulate_text)

In [58]:
sorted_df

Unnamed: 0_level_0,id,gender,age,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5114,5114,male,25,"[""slashdot raise lot urllink interesting thoug..."
7596,7596,male,26,"[""every day half day took afternoon hit dentis..."
8349,8349,male,24,"[""ever wondered urllink bad head head crash ha..."
9289,9289,male,23,"[""testing every passing day hard imagine going..."
9470,9470,male,25,"[""yet another tv channel time 's called zoom c..."
...,...,...,...,...
4335239,4335239,female,27,"[""watching olympics got really nervous 100m hu..."
4335684,4335684,male,27,"[""ok driving fast i-88 thinking thinking heave..."
4336267,4336267,male,27,"[""discus spy fly developed university californ..."
4336547,4336547,male,23,"[""well first post guess would proper start tel..."


In [59]:
sorted_df['GenderCode'] = sorted_df['gender'].apply(lambda x: 1 if x == 'female' else 0)

In [60]:
sorted_df

Unnamed: 0_level_0,id,gender,age,text,GenderCode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5114,5114,male,25,"[""slashdot raise lot urllink interesting thoug...",0
7596,7596,male,26,"[""every day half day took afternoon hit dentis...",0
8349,8349,male,24,"[""ever wondered urllink bad head head crash ha...",0
9289,9289,male,23,"[""testing every passing day hard imagine going...",0
9470,9470,male,25,"[""yet another tv channel time 's called zoom c...",0
...,...,...,...,...,...
4335239,4335239,female,27,"[""watching olympics got really nervous 100m hu...",1
4335684,4335684,male,27,"[""ok driving fast i-88 thinking thinking heave...",0
4336267,4336267,male,27,"[""discus spy fly developed university californ...",0
4336547,4336547,male,23,"[""well first post guess would proper start tel...",0


In [61]:
X_train, X_test, Y_train, Y_test = train_test_split(sorted_df.text, sorted_df.GenderCode, random_state=1)

In [62]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
X_test_tf = tfidf_vectorizer.transform(X_test)

In [63]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tf, Y_train)
predictions = naive_bayes.predict(X_test_tf)

In [64]:
print('Accuracy: ', accuracy_score(Y_test, predictions))

Accuracy:  0.7111770524233432
