# Data preparation

In [None]:
import pandas as pd
# Load the two post dumps and label each one with its class:
# tag 0 = rows from the Python file, tag 1 = rows from the Java file.
# Forward slashes are used on purpose: '\p' and '\j' are invalid string escape
# sequences (a warning today, an error in future Python versions), and a
# backslash separator only resolves on Windows. 'data/...' works everywhere.
df_python = pd.read_csv('data/python_posts.csv')
df_python['tag'] = 0
df_java = pd.read_csv('data/java_posts.csv')
df_java['tag'] = 1

# Stack both labelled frames and drop every row that has a missing value.
frames = [df_python, df_java]
df = pd.concat(frames)
df = df.dropna()
df.head()

## languages
Detect the language of each post and keep only the posts written in English.

In [None]:
from langdetect import detect
from langdetect.detector import LangDetectException

def lang_for_row(row):
    """Detect the language of a single post.

    Args:
        row: a DataFrame row (or any mapping) whose "content" entry is the
            text handed to ``detect``.

    Returns:
        Whatever ``detect`` returns for the text (compared against 'en' by the
        caller), or ``None`` when ``detect`` raises ``LangDetectException``.
    """
    try:
        return detect(row["content"])
    except LangDetectException:
        # Best effort: a post whose language cannot be detected is reported as
        # None instead of aborting the whole DataFrame.apply pass.
        return None
# Annotate every post with the language code returned by lang_for_row.
df['lang'] = df.apply(lang_for_row, axis=1)

# Keep only the posts detected as English and report how many were dropped.
before = len(df)
is_english = df['lang'] == 'en'
df = df.loc[is_english]
after = len(df)
print("removed %d rows because detected as not English" % (before - after))

## create tokens

In [None]:
from nltk.tokenize import WhitespaceTokenizer
tokenizer = WhitespaceTokenizer()

def tokenize(row):
    """Add a lower-cased token list to a post.

    Splits ``row['content']`` with the module-level ``tokenizer`` and stores
    the lower-cased pieces under ``row['tokens']``. The row is modified in
    place and also returned so the function can be used with DataFrame.apply.
    """
    pieces = tokenizer.tokenize(row['content'])
    row['tokens'] = list(map(str.lower, pieces))
    return row

# Row-wise pass: tokenize() returns each row extended with a 'tokens' entry,
# so the result is the same frame plus the new column.
df = df.apply(tokenize, axis='columns')
df.head()

## remove stopwords

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Loaded lazily so the corpus is only read when a row is actually
        # processed, i.e. after the nltk.download call earlier in this cell.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(row):
    """Drop English stop words from ``row['tokens']``.

    Args:
        row: a DataFrame row (or any mapping) with a 'tokens' list.

    Returns:
        The same row, with 'tokens' replaced by the tokens that are not in the
        English stop-word list (original order preserved).
    """
    # The stop words are fetched once and kept as a set: the previous version
    # re-read the list for every row and did a linear scan for every token.
    stop_words = _english_stop_words()
    row['tokens'] = [w for w in row['tokens'] if w not in stop_words]
    return row
    
# Row-wise pass that rewrites each post's 'tokens' without the stop words.
df = df.apply(remove_stopwords, axis='columns')
df.head()

## Stemming (optional, currently disabled)

In [None]:
import nltk

porter = nltk.PorterStemmer()

def simplify(row):
    """Stem every token of a post.

    Each entry of ``row['tokens']`` is passed through the module-level
    ``porter`` stemmer. The row is modified in place and returned.
    """
    stems = []
    for word in row['tokens']:
        stems.append(porter.stem(word))
    row['tokens'] = stems
    return row
    
# df = df.apply(simplify, axis=1)
# df.head()

# Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
# vectorizer = HashingVectorizer(alternate_sign=False, norm='l2', binary=False)
# TF-IDF over the pre-tokenised posts. With these settings scikit-learn ignores
# terms that occur in more than half of the documents (max_df=0.5) or in fewer
# than four documents (min_df=4).
vectorizer = TfidfVectorizer(max_df=0.5, min_df=4, use_idf=True)

# The vectorizer consumes strings, so the token lists are joined back with
# spaces. A column-wise apply is used instead of a row-wise df.apply(axis=1):
# it avoids building a Series per row and still yields a Series on an empty frame.
df['features'] = df['tokens'].apply(' '.join)

X = vectorizer.fit_transform(df['features'])
X.shape

In [None]:
# from sklearn.model_selection import train_test_split 
# y = df['tag']
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Training

In [None]:
# from sklearn.naive_bayes import GaussianNB
# bayes = GaussianNB()
# model = bayes.fit(X_train.toarray(), y_train)

# Scoring

In [None]:
# from sklearn.metrics import classification_report
# y_pred = model.predict(X_test.toarray())
# print(classification_report(y_test, y_pred))

# Clustering

## Normalization

In [None]:
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

N_SVD_COMPONENTS = 1000  # dimensionality of the reduced (LSA) space

# Latent semantic analysis: TruncatedSVD projects the TF-IDF matrix onto
# N_SVD_COMPONENTS dimensions, then Normalizer rescales every row to unit norm.
# random_state is fixed so the randomized SVD is reproducible, in line with
# the random_state=0 used for KMeans elsewhere in this notebook.
svd = TruncatedSVD(N_SVD_COMPONENTS, random_state=0)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
# To skip the SVD step:  lsa = make_pipeline(normalizer)

# NOTE(review): the KMeans fits visible in this notebook are run on X, not on
# X_norm - confirm whether this reduced matrix is still needed.
X_norm = lsa.fit_transform(X)
# To bypass LSA entirely:  X_norm = X

## Clustering work

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Elbow method: fit k-means for a range of cluster counts and plot the inertia
# (sum of squared distances to the closest centroid) against k.
K = range(1, 30, 3)
distortions = []
for k in K:
    # n_jobs is intentionally not passed: the parameter was deprecated in
    # scikit-learn 0.23 and removed in 1.0, where it raises a TypeError.
    kmeans_for_k = KMeans(n_clusters=k, random_state=0, verbose=2).fit(X)
    distortions.append(kmeans_for_k.inertia_)

plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of Squared Errors')
plt.title('The Elbow Method showing the optimal k')
plt.show()



## Centroids

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

NO_OF_CLUSTERS = 2        # number of clusters to fit
TOP_ELEMENT_COUNT = 40    # number of top-weighted terms shown per cluster

# n_jobs is intentionally not passed: the parameter was deprecated in
# scikit-learn 0.23 and removed in 1.0, where it raises a TypeError.
kmeans_for_k = KMeans(n_clusters=NO_OF_CLUSTERS, random_state=0, verbose=2).fit(X)

# For every centroid: feature indices ordered from highest to lowest weight.
centroids = kmeans_for_k.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# One word cloud per cluster, built from its top-weighted terms.
for n in range(NO_OF_CLUSTERS):
    print("cluster %d" % n)
    elems = [terms[index] for index in centroids[n, :TOP_ELEMENT_COUNT]]
    wc = WordCloud().generate(' '.join(elems))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()

# Visualization

In [None]:
# from sklearn.manifold import TSNE
# from sklearn.decomposition import PCA

# tsne = TSNE(n_components=2, init='pca', random_state=0)
# X_tsne = tsne.fit_transform(X_norm.toarray())
# df['x-tsne'] = X_tsne[:,0]
# df['y-tsne'] = X_tsne[:,1]
# df['prediction'] = y_pred
