In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

# Data preparation

In [None]:
def read_from_files():
    """Load the four per-topic CSV exports into one labelled DataFrame.

    Reads ``data/<topic>_posts.csv`` for python, java, javascript and devops,
    adds a numeric ``tag`` column (0-3, in that order) and a ``tag_text``
    column holding the topic name, concatenates the frames, drops every row
    that contains a missing value and indexes the result by the ``id`` column.

    Returns:
        pandas.DataFrame: the combined posts, indexed by ``id``.
    """
    # (topic name, numeric tag) in the order the frames are concatenated.
    topics = [('python', 0), ('java', 1), ('javascript', 2), ('devops', 3)]

    frames = []
    for tag_text, tag in topics:
        # Forward slashes resolve on every platform; a backslash inside a
        # normal string literal is an escape character and only worked as a
        # path separator on Windows.
        frame = pd.read_csv('data/%s_posts.csv' % tag_text)
        frame['tag'] = tag
        frame['tag_text'] = tag_text
        frames.append(frame)

    df = pd.concat(frames)
    df = df.dropna()
    df = df.set_index('id')
    return df

# df = read_from_files()
# df.head()

## languages
Filter out posts that are not written in English.

In [None]:
from langdetect import detect
from langdetect.detector import LangDetectException

def remove_non_englis_posts(df):
    """Keep only the rows whose ``content`` is detected as English.

    Args:
        df: DataFrame with a ``content`` column holding the post text.

    Returns:
        A new DataFrame with an added ``lang`` column, restricted to the rows
        for which ``detect`` returned ``'en'``. The frame passed in is not
        modified. The number of removed rows is printed.
    """
    def detect_language(text):
        try:
            return detect(text)
        except LangDetectException:
            # Detection failed for this text; None never equals 'en', so the
            # row is dropped together with the non-English ones.
            return None

    # Series.apply always yields a Series (also for an empty frame) and
    # assign() works on a copy, so the caller's frame keeps its columns.
    tagged = df.assign(lang=df['content'].apply(detect_language))
    english = tagged[tagged['lang'] == 'en']

    before = len(tagged.index)
    after = len(english.index)
    print("removed %d rows because detected as not English" % (before-after))
    return english

import pickle
# with open('data/posts.pickle', 'wb') as f:
#     pickle.dump(df, f)
def read_from_pickle():
    """Return the object stored in ``data/posts.pickle``.

    The path is relative to the current working directory. Unpickling can run
    arbitrary code, so only load a file this notebook wrote itself.
    """
    with open('data/posts.pickle', mode='rb') as pickle_file:
        posts = pickle.load(pickle_file)
    return posts
    
# Load the cached posts frame from data/posts.pickle (the CSV-based
# read_from_files() call earlier in the notebook is commented out).
df = read_from_pickle()
# Preview the first rows as the cell output.
df.head()

## create tokens

In [None]:
from nltk.tokenize import WhitespaceTokenizer
# Shared tokenizer instance used by tokenize() below.
tokenizer = WhitespaceTokenizer()


def tokenize(row):
    """Add a ``tokens`` entry to ``row`` with the lower-cased tokens of its text.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a ``content`` string.

    Returns:
        The same ``row`` object, with ``row['tokens']`` set to a list of
        lower-cased tokens produced by the module-level ``tokenizer``.
    """
    raw_tokens = tokenizer.tokenize(row['content'])
    row['tokens'] = list(map(str.lower, raw_tokens))
    return row

# Run tokenize on every row (axis=1); each returned row carries a new 'tokens' list.
df = df.apply(tokenize, axis=1)
# Summary statistics of the resulting frame as the cell output.
df.describe()
# tokenizer.tokenize("zzz zz")

## remove stopwords

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Filled on first use so the stop-word corpus is read once, not once per row.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(row):
    """Drop English stop words from ``row['tokens']``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) whose ``tokens`` entry
            is a list of token strings.

    Returns:
        The same ``row`` object with ``row['tokens']`` replaced by the tokens
        that are not in NLTK's English stop-word list (order preserved).
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # A frozenset gives constant-time membership tests; the list returned
        # by stopwords.words() would be scanned linearly for every token.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    row['tokens'] = [w for w in row['tokens'] if w not in _ENGLISH_STOP_WORDS]
    return row
    
# Filter the 'tokens' list of every row (axis=1) through remove_stopwords.
df = df.apply(remove_stopwords, axis=1)
# Preview the first rows as the cell output.
df.head()

## [stemming]

In [None]:
import nltk

# Shared Porter stemmer instance used by simplify() below.
porter = nltk.PorterStemmer()


def simplify(row):
    """Replace every token in ``row['tokens']`` with its Porter stem.

    Args:
        row: mapping-like object (e.g. a DataFrame row) whose ``tokens`` entry
            is a list of token strings.

    Returns:
        The same ``row`` object with ``row['tokens']`` holding the stemmed
        tokens, in the original order.
    """
    stems = []
    for word in row['tokens']:
        stems.append(porter.stem(word))
    row['tokens'] = stems
    return row
    
# Stem the 'tokens' list of every row (axis=1) with simplify.
df = df.apply(simplify, axis=1)
# Preview the first rows as the cell output.
df.head()

# Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
# vectorizer = HashingVectorizer(alternate_sign=False, norm='l2', binary=False)
# Ignore terms that occur in more than half of the posts (max_df=0.5) or in
# fewer than 4 posts (min_df=4).
vectorizer = TfidfVectorizer(max_df=0.5, min_df=4, use_idf=True)

# The vectorizer expects one string per document, so glue the preprocessed
# tokens of each post back together with single spaces.
df['features'] = df['tokens'].apply(' '.join)

# fit_transform learns the vocabulary / idf weights and builds the document-term
# matrix in a single pass instead of a separate fit() followed by transform().
X = vectorizer.fit_transform(df['features'])

# (number of posts, number of retained terms)
X.shape
# X[0]

# Clustering

## Normalization

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline

# Keep only the 100 best-scoring columns of X. The scores are computed against
# the tag labels passed to fit_transform below, so this step is supervised.
kbest = SelectKBest(k=100)
y = df['tag']

# Normalizer with its default settings
# (presumably rescales each sample to unit L2 norm — confirm against the sklearn default).
normalizer = Normalizer()

# Feature selection first, then per-sample normalization.
pipeline = Pipeline([('kbest', kbest), ('norm', normalizer)])

# Reduced, normalized matrix that the clustering cells below are called with.
X_norm = pipeline.fit_transform(X, y)

## Clustering work

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def elbow(X):
    """Plot KMeans inertia against the number of clusters (elbow method).

    Fits one KMeans model (random_state=0) for each k in 1, 4, 7, ..., 28 on
    ``X``, collects each model's ``inertia_`` and shows a line plot of inertia
    versus k.

    Args:
        X: feature matrix accepted by ``KMeans.fit``.
    """
    candidate_ks = range(1, 30, 3)
    inertias = [
        KMeans(n_clusters=cluster_count, random_state=0, n_jobs=-1, verbose=2).fit(X).inertia_
        for cluster_count in candidate_ks
    ]

    plt.plot(candidate_ks, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum of Squared Errors')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()
# Draw the elbow plot for the selected-and-normalized feature matrix.
elbow(X_norm)

## Centroids

In [None]:
# Number of KMeans clusters fitted by show_centroids
# (presumably chosen to match the four tag classes — confirm).
NO_OF_CLUSTERS=4
# How many of the highest-weighted centroid entries per cluster go into each word cloud.
TOP_ELEMENT_COUNT=60

def show_centroids(X):
    """Fit KMeans on ``X`` and show a word cloud of the top terms per cluster.

    For each of the ``NO_OF_CLUSTERS`` clusters, the ``TOP_ELEMENT_COUNT``
    columns with the largest centroid weight are translated to vocabulary
    terms and rendered as a word cloud.

    Args:
        X: feature matrix whose columns are either the full vocabulary of
            ``vectorizer`` or the subset kept by the ``kbest`` step of
            ``pipeline`` (e.g. ``X_norm``).

    Raises:
        ValueError: if the columns of ``X`` cannot be matched to terms.
    """
    from wordcloud import WordCloud

    kmeans_for_k = KMeans(n_clusters=NO_OF_CLUSTERS,  random_state=0, n_jobs=-1, verbose=2).fit(X)
    # Column indices of every centroid, ordered from largest to smallest weight.
    centroids = kmeans_for_k.cluster_centers_.argsort()[:, ::-1]

    terms = np.asarray(vectorizer.get_feature_names())
    if X.shape[1] != len(terms):
        # X went through feature selection, so column i is the i-th *selected*
        # term, not the i-th vocabulary term. Indexing the full vocabulary here
        # would label the clusters with unrelated words.
        terms = terms[pipeline.named_steps['kbest'].get_support()]
    if X.shape[1] != len(terms):
        raise ValueError(
            "cannot map the %d columns of X to vocabulary terms" % X.shape[1])

    for n in range(NO_OF_CLUSTERS):
        print("cluster %d" % n)
        elems = [terms[index] for index in centroids[n, :TOP_ELEMENT_COUNT]]
        wc = WordCloud().generate(' '.join(elems))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")
        plt.show()
        
# Cluster the selected-and-normalized feature matrix and render one word cloud per cluster.
show_centroids(X_norm)

# Visualization

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# tsne = TSNE(n_components=2, init='pca', random_state=0)
# X_tsne = tsne.fit_transform(X.toarray())
# df['x-tsne'] = X_tsne[:,0]
# df['y-tsne'] = X_tsne[:,1]

In [None]:
# df['cluster'] = kmeans_for_k.predict(X)
# sns.scatterplot(data=df, x='x-tsne', y='y-tsne', hue='cluster')

# Classify

## Train

In [None]:
# Target vector: the numeric tag of every post.
y = df.tag

from sklearn.model_selection import train_test_split 
# Seeded split of the full TF-IDF matrix; no test_size is passed, so sklearn's default applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

def get_bayes(X, y):
    """Fit a Gaussian naive Bayes classifier and return it.

    Args:
        X: dense training feature matrix.
        y: training labels, one per row of ``X``.

    Returns:
        The fitted ``GaussianNB`` instance.
    """
    from sklearn.naive_bayes import GaussianNB
    return GaussianNB().fit(X, y)

# The sparse training matrix is densified with toarray() before it is handed to get_bayes.
classifier = get_bayes(X_train.toarray(), y_train)
# Show the fitted estimator as the cell output.
classifier

## Score

In [None]:
def score_classifier(classifier, X_test, y_test):
    """Print a classification report and confusion matrix for ``classifier``.

    Args:
        classifier: fitted estimator exposing ``predict``.
        X_test: feature matrix to predict on.
        y_test: true labels for ``X_test``.

    Returns:
        None. Both reports are printed.
    """
    from sklearn.metrics import classification_report, confusion_matrix
    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred))
    # No explicit ``labels``: the matrix then covers every class present in
    # y_test or y_pred. The former labels=[0,1] silently dropped classes 2
    # (javascript) and 3 (devops).
    print(confusion_matrix(y_test, y_pred))
# score_classifier only prints its reports and has no return statement, so `score` ends up as None.
score = score_classifier(classifier, X_test.toarray(), y_test)

In [None]:
# Same scoring call as in the previous cell; prints the reports again and leaves `score` as None.
score = score_classifier(classifier, X_test.toarray(), y_test)

## Reduce dimensions

In [None]:
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

def pca_for_range(X, counts, labels=None):
    """Plot naive Bayes test accuracy against the number of PCA components.

    For every value in ``counts`` the data is projected onto that many
    principal components, split into train/test parts (random_state=0), a
    classifier is trained with ``get_bayes`` and its accuracy on the test
    part is recorded. The accuracies are then plotted against ``counts``.

    Args:
        X: sparse feature matrix (must support ``toarray()``).
        counts: iterable of component counts to try.
        labels: class label per row of ``X``. Defaults to the notebook-level
            ``y`` so existing two-argument calls keep working.
    """
    if labels is None:
        labels = y
    counts = list(counts)
    # The dense copy does not depend on the component count, so build it once
    # instead of once per iteration.
    X_dense = X.toarray()

    scores = []
    for count in counts:
        pca = PCA(n_components=count, random_state=0)
        X_reduced = pca.fit_transform(X_dense)
        print("After dim reduction shape is (%d,%d)" % X_reduced.shape)
        X_train, X_test, y_train, y_test = train_test_split(X_reduced, labels, random_state=0)
        classifier = get_bayes(X_train, y_train)
        y_pred = classifier.predict(X_test)
        scores.append(accuracy_score(y_test, y_pred))

    plt.plot(counts, scores)
    plt.xlabel('number of PCA components')
    plt.ylabel('test accuracy')

Trying to find the optimal number of components.
First we perform a coarse search over a wide range, and then a more granular one.

In [None]:
# Coarse sweep: 1, 6, 11, ..., 96 components.
pca_for_range(X, range(1,100,5))

In [None]:
# Fine sweep: every component count from 10 to 29.
pca_for_range(X, range(10,30,1))

Optimal number of components seems to be 21.

In [None]:
# Component count picked from the sweeps above.
OPTIMAL_COMPONENTS = 21
# Seeded like every other PCA in this notebook (including the sweep that
# produced the value 21), so the fitted components are the same on every run.
pca = PCA(n_components=OPTIMAL_COMPONENTS, random_state=0)
pca.fit(X.toarray())

We can learn about the most influential features of every principal component. For our dataset this means finding the most characteristic word of each component.

In [None]:
# One row per principal component, one column per vocabulary term.
feats = vectorizer.get_feature_names()
dfcomponents = pd.DataFrame(columns=feats, data=pca.components_)
# A term's influence on a component is the magnitude of its loading; a large
# negative loading matters as much as a large positive one, so rank by
# absolute value rather than by signed value.
dfcomponents.abs().idxmax(axis=1)

In [None]:
# Project onto the chosen components, then down to 2 dimensions for plotting.
X_pca = pca.transform(X.toarray())
pca2 = PCA(n_components=2, random_state=0)
X_pca2 = pca2.fit_transform(X_pca)

# Build the clustering model here: earlier `kmeans_for_k` objects only exist
# as locals inside elbow() / show_centroids(), so relying on that name made
# this cell fail with a NameError on a fresh kernel.
kmeans_for_k = KMeans(n_clusters=NO_OF_CLUSTERS, random_state=0)
kmeans_for_k.fit(X_pca2)
df['cluster'] = kmeans_for_k.predict(X_pca2)
df['pca1'] = X_pca2[:,0]
df['pca2'] = X_pca2[:,1]
# df['pca1'] = y[0]b

In [None]:
# Posts in the 2-D PCA plane ('pca1' vs 'pca2'), coloured by their KMeans cluster id.
sns.scatterplot(data=df, x='pca1', y='pca2', hue='cluster')

In [None]:
# 'ok' marks the posts whose cluster id equals their tag id. The comparison
# used to be `!=`, which flagged the mismatches as ok.
# NOTE(review): KMeans numbers its clusters arbitrarily, so cluster i is not
# guaranteed to correspond to tag i — confirm the alignment (or map each
# cluster to its majority tag) before reading these counts as accuracy.
df['ok'] = df['cluster'] == df['tag']
# Matching / non-matching posts per topic.
df.groupby(['tag_text','ok'])['tag'].count()

## Train after PCA

In [None]:
y = df['tag']
# Seeded like the other splits in this notebook so the reported scores are
# reproducible between runs.
X_train,X_test,y_train,y_test = train_test_split(X_pca, y, random_state=0)
print("will train on features of a space (%d,%d)" % X_pca.shape)
# `bayes` is reused later by predict_text.
bayes = get_bayes(X_train, y_train)
score_classifier(bayes, X_test, y_test)

In [None]:
# Number of cross-validation folds; also reported in the message printed by bayes_kfolded.
K_FOLDS = 10
from sklearn.model_selection import StratifiedKFold
# Splitter shared by bayes_kfolded below.
skf = StratifiedKFold(n_splits=K_FOLDS)
# ydf = pd.DataFrame()
# ydf['label'] = y

# bayes_kfolded selects labels with y[train_index] / y[test_index].
# NOTE(review): presumably converted to a plain array so those index arrays
# select by position rather than by the frame's index labels — confirm.
y = np.array(y)
def bayes_kfolded(X, y):
    """Cross-validate the naive Bayes classifier and print its mean accuracy.

    Uses the notebook-level ``skf`` splitter. For every fold a classifier is
    trained with ``get_bayes`` on the training rows and scored on the held-out
    rows.

    Args:
        X: sparse feature matrix supporting row indexing and ``toarray()``.
        y: array of labels that can be indexed with integer index arrays.
    """
    scores = []
    # Split the matrix that was passed in; the loop used to split the global
    # X_pca, which only worked because it has the same number of rows.
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index].toarray(), X[test_index].toarray()
        y_train, y_test = y[train_index], y[test_index]
        bayes = get_bayes(X_train, y_train)
        scores.append(bayes.score(X_test, y_test))
    avg_score = np.average(scores)
    print("Avg score for %d folds is %.2f" %(K_FOLDS, avg_score))
    
# len(y)
# y[1]
# y
# Cross-validate on the full sparse TF-IDF matrix (not the PCA-reduced one).
bayes_kfolded(X, y)
# ydf
# type(y)
# df.index.duplicated()


In [None]:
def predict_text(text):
    """Print ``text`` and the predicted class probabilities (in percent).

    The text goes through the same preprocessing as the training posts
    (tokenize, remove_stopwords, simplify) before it is vectorized, projected
    with the fitted ``pca`` and scored by the ``bayes`` classifier.

    Args:
        text: raw text to classify.
    """
    print(text)
    # The vocabulary was learned from lower-cased, stop-word-free, stemmed
    # tokens; vectorizing the raw text would miss every word whose stem
    # differs from its surface form (e.g. "loading" vs "load").
    row = {'content': text}
    row = simplify(remove_stopwords(tokenize(row)))
    features = vectorizer.transform([' '.join(row['tokens'])])
    reduced = pca.transform(features.toarray())
    proba = bayes.predict_proba(reduced)
    print(proba[0]*100)
    

# Sample sentences for each of the four topics.
predict_text("I love java")
predict_text("I love python")
predict_text("I am using django because it is easy")
predict_text("in this button you have to write ajax callback and the page is loading")
predict_text("install on the server as root using container")

# Legend for the probability columns: the tag ids assigned in read_from_files.
# The previous legend named only two of the four classes.
print("0 is python, 1 is java, 2 is javascript, 3 is devops")