# Data preparation

In [90]:
import pandas as pd
df_python = pd.read_csv('data\python_posts.csv')
df_python['tag'] = 0
df_java = pd.read_csv('data\java_posts.csv')
df_java['tag'] = 1
frames = [df_python, df_java]
df = pd.concat(frames)
df = df.dropna()
df.head()

Unnamed: 0,id,title,content,tag
0,67371,Silly Name Generator,\n\n\n\n Table of Contents\n\n\n Introduction...,0
1,67290,Profiling: check how long it takes to run a Py...,\nFrom Python’s official documentation: “A pro...,0
2,67273,First Adventure in Malware Data Science,\nI haven’t been a software developer for very...,0
3,67057,PyKup - Web App backup manager,\nGitHub Link\n\n\n\n PyKup - WebApp backup m...,0
4,67121,How to reverse a list in Python,\n\n\n\n\nChristian Barra\n@christianbarra\n\n...,0


## create tokens

In [91]:
from nltk.tokenize import WhitespaceTokenizer
tokenizer = WhitespaceTokenizer()

def tokenize(row):
    row['tokens'] = tokenizer.tokenize(row['content'])
    return row

df = df.apply(tokenize, axis=1)
df.head()
# tokenizer.tokenize("zzz zz")

Unnamed: 0,id,title,content,tag,tokens
0,67371,Silly Name Generator,\n\n\n\n Table of Contents\n\n\n Introduction...,0,"[Table, of, Contents, Introduction, Name, List..."
1,67290,Profiling: check how long it takes to run a Py...,\nFrom Python’s official documentation: “A pro...,0,"[From, Python’s, official, documentation:, “A,..."
2,67273,First Adventure in Malware Data Science,\nI haven’t been a software developer for very...,0,"[I, haven’t, been, a, software, developer, for..."
3,67057,PyKup - Web App backup manager,\nGitHub Link\n\n\n\n PyKup - WebApp backup m...,0,"[GitHub, Link, PyKup, -, WebApp, backup, manag..."
4,67121,How to reverse a list in Python,\n\n\n\n\nChristian Barra\n@christianbarra\n\n...,0,"[Christian, Barra, @christianbarra, How, to, r..."


## remove stopwords

In [92]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(row):
    stop_words = stopwords.words('english')
    row['tokens'] = [w for w in row['tokens'] if not w in stop_words] 
    return row
    
df = df.apply(remove_stopwords, axis=1)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukasz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,title,content,tag,tokens
0,67371,Silly Name Generator,\n\n\n\n Table of Contents\n\n\n Introduction...,0,"[Table, Contents, Introduction, Name, Lists, F..."
1,67290,Profiling: check how long it takes to run a Py...,\nFrom Python’s official documentation: “A pro...,0,"[From, Python’s, official, documentation:, “A,..."
2,67273,First Adventure in Malware Data Science,\nI haven’t been a software developer for very...,0,"[I, haven’t, software, developer, long,, I, en..."
3,67057,PyKup - Web App backup manager,\nGitHub Link\n\n\n\n PyKup - WebApp backup m...,0,"[GitHub, Link, PyKup, -, WebApp, backup, manag..."
4,67121,How to reverse a list in Python,\n\n\n\n\nChristian Barra\n@christianbarra\n\n...,0,"[Christian, Barra, @christianbarra, How, rever..."


## lowercase and stemming

In [93]:
import nltk

porter = nltk.PorterStemmer()

def simplify(row):
    row['tokens'] = [porter.stem(token) for token in row['tokens']] 
    return row
    
df = df.apply(simplify, axis=1)
df.head()

Unnamed: 0,id,title,content,tag,tokens
0,67371,Silly Name Generator,\n\n\n\n Table of Contents\n\n\n Introduction...,0,"[tabl, content, introduct, name, list, first, ..."
1,67290,Profiling: check how long it takes to run a Py...,\nFrom Python’s official documentation: “A pro...,0,"[from, python’, offici, documentation:, “A, pr..."
2,67273,First Adventure in Malware Data Science,\nI haven’t been a software developer for very...,0,"[I, haven’t, softwar, develop, long,, I, enjoy..."
3,67057,PyKup - Web App backup manager,\nGitHub Link\n\n\n\n PyKup - WebApp backup m...,0,"[github, link, pykup, -, webapp, backup, manag..."
4,67121,How to reverse a list in Python,\n\n\n\n\nChristian Barra\n@christianbarra\n\n...,0,"[christian, barra, @christianbarra, how, rever..."


# Vectorization

In [94]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
df['features'] = df.apply(lambda row: ' '.join(row['tokens']), axis=1)
X = cv.fit_transform(df['features'])
X.shape

(587, 20256)

In [95]:
from sklearn.model_selection import train_test_split 
y = df['tag']
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Training

In [96]:
from sklearn.naive_bayes import GaussianNB
bayes = GaussianNB()
model = bayes.fit(X_train.toarray(), y_train)

# Scoring

In [97]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test.toarray())
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.77      0.76      0.77        76
          1       0.75      0.76      0.76        71

avg / total       0.76      0.76      0.76       147

