# Data Loading

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import sparse

# Comment-level data: the preview shows author, subreddit, created_utc and body columns.
training_set_dataframe = pd.read_csv("data/train_data.csv", encoding="utf8")
print(training_set_dataframe.head())

# Author-level labels: the preview shows an author column and a 0/1 gender column.
training_targets_dataframe = pd.read_csv("data/train_target.csv")
print(training_targets_dataframe.head())

# Raw comment texts, in file order.
training_set = list(training_set_dataframe["body"])
print(training_set[:3])

# author -> gender lookup, then one label per comment row (KeyError if an
# author in the training data has no entry in the targets file).
training_targets_dictionary = dict(
    zip(training_targets_dataframe["author"], training_targets_dataframe["gender"])
)
training_targets = [
    training_targets_dictionary[author]
    for author in training_set_dataframe["author"]
]
training_targets[:10]

          author          subreddit   created_utc  \
0    Shamus_Aran       mylittlepony  1.388534e+09   
1       Riddance                sex  1.388534e+09   
2  Secret_Wizard       DragonsDogma  1.388534e+09   
3   Penultimatum  malefashionadvice  1.388534e+09   
4      7-SE7EN-7      todayilearned  1.388534e+09   

                                                body  
0  I don't think we'd get nearly as much fanficti...  
1  Thanks. I made it up, that's how I got over my...  
2  Are you sure you aren't confusing Cyclops (the...  
3                             dont do this to me bro  
4        That's what we do when we can't find a mate  
            author  gender
0     RedThunder90       0
1          Lirkmor       1
2           In0chi       0
3    ProjectGrudge       0
4  TehTurtleHermit       0
["I don't think we'd get nearly as much fanfiction and pictures shipping Ban-Ban and Lyro. Just saying.", "Thanks. I made it up, that's how I got over my first heart break. ", "Are you sure

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [25]:
def join_strings(x):
    """Concatenate the strings in ``x`` into one comma-separated string.

    ``x`` is any iterable of ``str`` (used below as a groupby aggregator).
    No space is inserted after the comma; an empty iterable yields ``""``.
    """
    separator = ','
    return separator.join(x)

# Collapse the comment-level frame to one row per author: subreddit names and
# comment bodies are comma-joined, timestamps are averaged.
training_set_df_grouped = (
    training_set_dataframe
    .groupby('author', as_index=False)
    .agg({'subreddit': join_strings,
          'body': join_strings,
          'created_utc': 'mean'})
)

# One concatenated document per author, in the grouped frame's row order.
training_set2 = list(training_set_df_grouped.body)
print(training_set2[:3])

# NOTE: this rebinds `training_targets` from one-label-per-comment to
# one-label-per-author, aligned row-for-row with `training_set2`. Cells below
# that pair it with author-level features must run after this cell.
training_targets_dictionary = dict(zip(training_targets_dataframe.author, training_targets_dataframe.gender))
training_targets = [
    training_targets_dictionary[author]
    for author in training_set_df_grouped.author
]
training_targets[:1]

['Neil Diamond - Sweet Caroline,+1 on the chiropractor. I went religiously from ages 14 - 19 due to extreme lower back pain from running track / cross country. It\'s important to remember it\'s a somewhat slow process, but worth the results. \n\nOh, and don\'t worry, they aren\'t cracking your back - its just an "adjustment" ,Looks like the pattern on the envelope that middle school photos came in. ,If you had a bad motivator would you want to spell that out every time?,And nothing beats a hangover quite like bacon (or better yet, Taylor ham), eggs over easy, toast, and hash browns!,No, I think YOU mean Taylor Ham... Really, whatever you call it doesn\'t matter, its damn delicious. ', 'Just read the FAQ, really.', "I just received my Deathadder Black Edition yesterday and I just tried it out. Although I understand the Black Edition and the 2013 vary in terms of the coating on the mouse, I'm just going off with my experience of using it for an hour or so.\n\nOriginally I use a Logitech 

[0]

# Feature Engineering

In [None]:
import re
import nltk

def Tokenizer(str_input):
    """Split ``str_input`` into lower-cased, Porter-stemmed word tokens.

    Every character other than an ASCII letter, digit or hyphen is replaced
    by a space before the text is lower-cased and split on whitespace.
    Returns a list of stems (empty for empty or all-punctuation input).
    """
    stemmer = nltk.PorterStemmer()
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    raw_words = cleaned.lower().split()
    return list(map(stemmer.stem, raw_words))

def Lemmatizer(str_input):
    """Split ``str_input`` into lower-cased, WordNet-lemmatized word tokens.

    Every character other than an ASCII letter, digit or hyphen is replaced
    by a space before the text is lower-cased and split on whitespace. Each
    word is then passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments. Returns a list (empty for empty or all-punctuation input).
    """
    wordnet = nltk.stem.WordNetLemmatizer()
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    raw_words = cleaned.lower().split()
    return list(map(wordnet.lemmatize, raw_words))

# Manual spot check: a sentence mixing verb forms ("is are be were was") and a
# singular/plural pair ("person persons") to eyeball what the lemmatizer does.
Lemmatizer(
    "My string input is are be were much better than yours was "
    "and they are mr person persons i despise I hate"
)

In [5]:
from collections import Counter
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
import nltk
# Toy example: tag one sentence and count how often each POS tag occurs.
text = "Guru99 is one of the best sites to his he they she them learn WEB, SAP, Ethical Hacking and much more online."
lower_case = text.lower()
tokens = nltk.word_tokenize(lower_case)
tags = nltk.pos_tag(tokens)

# Keep only the tag of every (token, tag) pair.
counts = Counter()
counts.update(pos for _token, pos in tags)
print(counts)

# A single mapping is vectorized into one row with one column per tag.
# `vectorizer` is reused (and refit) by a later cell.
vectorizer = DictVectorizer()
print(vectorizer.fit_transform(counts))

Counter({'PRP': 4, 'NN': 3, 'JJ': 3, ',': 2, 'VBZ': 1, 'CD': 1, 'IN': 1, 'DT': 1, 'JJS': 1, 'NNS': 1, 'TO': 1, 'PRP$': 1, 'VBP': 1, 'CC': 1, 'RB': 1, 'JJR': 1, '.': 1})
  (0, 0)	2.0
  (0, 1)	1.0
  (0, 2)	1.0
  (0, 3)	1.0
  (0, 4)	1.0
  (0, 5)	1.0
  (0, 6)	3.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 9)	3.0
  (0, 10)	1.0
  (0, 11)	4.0
  (0, 12)	1.0
  (0, 13)	1.0
  (0, 14)	1.0
  (0, 15)	1.0
  (0, 16)	1.0


In [26]:
def count_pos_tags(text):
    """Return the relative frequency of each POS tag found in ``text``.

    The text is lower-cased, tokenized with ``nltk.word_tokenize`` and tagged
    with ``nltk.pos_tag``. The result maps tag -> count / total tags, so the
    values sum to 1.0; text that yields no tokens gives an empty dict.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(text.lower()))
    tag_counts = Counter(pos for _token, pos in tagged)
    # Float denominator so the shares are true divisions.
    n_tags = float(sum(tag_counts.values()))
    return {pos: n / n_tags for pos, n in tag_counts.items()}

# Build one POS-tag frequency profile per author document and vectorize them.
# This tags every document in `training_set2` and is the slow step of the
# notebook (the recorded run of this cell ended in a KeyboardInterrupt).
training_vector = vectorizer.fit_transform(
    [count_pos_tags(document) for document in training_set2]
)
print(list(enumerate(vectorizer.get_feature_names_out())))

KeyboardInterrupt: 

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Base learners for the ensemble. Both stochastic estimators are seeded so the
# cross-validated score is reproducible from run to run.
clf1 = LogisticRegression(C = 0.001, tol = 0.01, max_iter=3000, random_state=0, n_jobs=-1)
clf2 = RandomForestClassifier(max_depth=5, max_features='sqrt', random_state=0, n_jobs=-1)
clf3 = MultinomialNB()

# Majority ("hard") vote over the three predicted labels.
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)],
        voting='hard', n_jobs=-1)

# 5-fold cross-validated accuracy on the per-author POS-tag features.
# NOTE(review): the recorded run printed a fold spread of 0.00 - worth
# confirming the ensemble is not just predicting a single class.
scores = cross_val_score(eclf, training_vector, training_targets, scoring='accuracy', cv=5, n_jobs=-1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Accuracy: 0.77 (+/- 0.00)


In [178]:
print(list(enumerate(vectorizer.get_feature_names_out())))

N_POINTS = 30000   # number of leading rows (authors) to draw
N_FEATURES = 5     # number of leading feature columns to pair up

# Marker colour per class label: 0 -> gray, 1 -> green.
colormap = np.array(['gray', 'g'])

# Loop-invariant work done once: slicing the sparse matrix, densifying it and
# looking up the colours used to be repeated for every feature pair.
point_colors = colormap[training_targets[:N_POINTS]]
dense_features = training_vector[:N_POINTS, :N_FEATURES].toarray()

fig, ax = plt.subplots()
# NOTE(review): many tag frequencies are exactly 0, which has no finite
# position on a log axis - confirm log scaling is really what is wanted here.
ax.set_xscale('log')
ax.set_yscale('log')
# Every unordered pair (i, j) with j < i is overlaid on the same axes.
for i in range(N_FEATURES):
    for j in range(i):
        ax.scatter(dense_features[:, i], dense_features[:, j], s=100, c=point_colors)
plt.show()

[(0, '#'), (1, '$'), (2, "''"), (3, '('), (4, ')'), (5, ','), (6, '.'), (7, ':'), (8, 'CC'), (9, 'CD'), (10, 'DT'), (11, 'EX'), (12, 'FW'), (13, 'IN'), (14, 'JJ'), (15, 'JJR'), (16, 'JJS'), (17, 'LS'), (18, 'MD'), (19, 'NN'), (20, 'NNP'), (21, 'NNPS'), (22, 'NNS'), (23, 'PDT'), (24, 'POS'), (25, 'PRP'), (26, 'PRP$'), (27, 'RB'), (28, 'RBR'), (29, 'RBS'), (30, 'RP'), (31, 'SYM'), (32, 'TO'), (33, 'UH'), (34, 'VB'), (35, 'VBD'), (36, 'VBG'), (37, 'VBN'), (38, 'VBP'), (39, 'VBZ'), (40, 'WDT'), (41, 'WP'), (42, 'WP$'), (43, 'WRB'), (44, '``')]


KeyboardInterrupt: 

In [162]:
import seaborn
# Densify the first 10 rows of the feature matrix and wrap them in a DataFrame
# for inspection; columns are positional feature indices.
# NOTE(review): the captured output shows a printed frame and an interrupted
# plot that these two lines alone would not produce - the rest of this cell
# appears to be missing; confirm against the original notebook.
vector = training_vector[:10].toarray()
toplot = pd.DataFrame(vector)

    0    1         2         3         4         5         6         7   \
0  0.0  0.0  0.000000  0.000000  0.000000  0.000000  0.095238  0.000000   
1  0.0  0.0  0.000000  0.000000  0.000000  0.055556  0.111111  0.000000   
2  0.0  0.0  0.000000  0.027027  0.027027  0.013514  0.027027  0.027027   
3  0.0  0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4  0.0  0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
5  0.0  0.0  0.010417  0.024306  0.024306  0.031250  0.020833  0.010417   
6  0.0  0.0  0.000000  0.000000  0.000000  0.011628  0.046512  0.011628   
7  0.0  0.0  0.000000  0.000000  0.000000  0.000000  0.137931  0.034483   
8  0.0  0.0  0.000000  0.044444  0.044444  0.000000  0.066667  0.088889   
9  0.0  0.0  0.000000  0.000000  0.000000  0.000000  0.285714  0.000000   

         8         9   ...        35        36        37        38        39  \
0  0.095238  0.000000  ...  0.000000  0.095238  0.000000  0.047619  0.000000   
1  0.000000  0

KeyboardInterrupt: 

Error in callback <function install_repl_displayhook.<locals>.post_execute at 0x000001A9C168C820> (for post_execute):


KeyboardInterrupt: 

Error in callback <function flush_figures at 0x000001A9C168C670> (for post_execute):


KeyboardInterrupt: 