# ================================
# Building a Robot Judge - Assignment 1
## Matthäus Heer
# ================================

# Import section

In [None]:
%load_ext autoreload
%autoreload 2

import os
import random
from collections import Counter
from pprint import pprint
from textwrap import indent

import spacy
import nltk
from nltk.corpus import stopwords
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt

from context import robot_judge  # Now we can use all the handy robot_judge functionality :-)
from robot_judge.io import ProblemSet1Io
from robot_judge.nlp.language_models import spacy_nlp, stop_words
from robot_judge.utils import indent as indent_text

# How-To

1) In the repo root directory there is a folder called _data_.  
2) Go there and create a folder called *assignment_1* (or whatever the DATA_DIR_NAME variable below is assigned to).   
3) Place all case files (*1936_X9VD8L.txt*, ...) in there.  
4) Place the *case_reversed.csv* file in there.  
5) Run the code below.  

In [None]:
# Name of the folder (inside the repo's data directory) that holds the case
# files and the case_reversed.csv file.
DATA_DIR_NAME = 'assignment_1'
# Number of cases to sample and work on.
N_SAMPLED_CASES = 10

# Task 1
## Compute sentence, word and letter counts per document and plot vs. year

In [None]:
from robot_judge.exploration.corpus_analysis import count_words_sents_letters
from robot_judge.exploration.corpus_analysis import visualize_counts

In [None]:
# Reader for the assignment data stored in the DATA_DIR_NAME folder.
io = ProblemSet1Io(data_dir=DATA_DIR_NAME)
# Sample N_SAMPLED_CASES cases. Later cells treat the result as a dict
# (case label keys, full text values).
sampled_cases = io.read_multiple_cases_files(n_samples=N_SAMPLED_CASES)

In [None]:
# Takse ~40min for 1000 cases
%time labels, word_counts, sents_counts, letters_counts = count_words_sents_letters(sampled_cases)
years = [ProblemSet1Io.get_year_from_case_title(label) for label in labels]

In [None]:
# Plot the word, sentence and letter counts of each document against its year.
visualize_counts(years, word_counts, sents_counts, letters_counts)

# Task 2
## Plot part-of-speech (POS) tagging frequency vs year

In [None]:
from robot_judge.exploration.corpus_analysis import get_pos_tags
from robot_judge.exploration.corpus_analysis import aggregate_avg_pos_tags
from robot_judge.exploration.corpus_analysis import visualize_avg_pos_vs_year

In [None]:
# Collect the POS-tag data for the sampled cases together with their years;
# the second argument is the callable used to get a year from a case title.
%time years, pos_tags = get_pos_tags(sampled_cases, ProblemSet1Io.get_year_from_case_title)

In [None]:
# Aggregate the POS tags by year (averaged, going by the helper's name —
# confirm in robot_judge.exploration.corpus_analysis).
pos_df = aggregate_avg_pos_tags(years, pos_tags)

In [None]:
# Plot the aggregated POS-tag frequencies against year.
visualize_avg_pos_vs_year(pos_df)

# Task 3 
## Corpus normalization / cleaning & trigram creation

In [None]:
from robot_judge.nlp.ngrams import aggregate_clean_sentences, train_phrase_model, get_sents_from_sentence_dict
from gensim.models.phrases import Phrases, Phraser
from robot_judge.nlp.ngrams import print_label_sent_dict
from robot_judge.nlp import spacy_doc
from robot_judge.nlp.filter import token_is_punct_space

In [None]:
# Separate sample of cases used for the n-gram task.
# case label keys, full text values
N_SAMPLED_CASES_FOR_NGRAMS = 10
test_corpus_dict = io.read_multiple_cases_files(N_SAMPLED_CASES_FOR_NGRAMS)

In [None]:
# Cleaned, tokenized sentences per case (presumably the normalization step of
# this task — see robot_judge.nlp.ngrams for the exact cleaning rules).
# case label keys, list (sentences) of lists (words)

# Takes around 10min for 1000 cases
%time unigram_sentences_dict = aggregate_clean_sentences(test_corpus_dict)

In [None]:
# Corpus-wide sentence list built from the per-case dict; this is the input
# the bigram phrase model is trained on.
# list (sentences) of lists (words)
unigram_sentences = get_sents_from_sentence_dict(unigram_sentences_dict)

In [None]:
# Train a phrase model on the unigram sentences and wrap it in a Phraser,
# which is what gets applied to sentences below.
bigram_model = Phraser(train_phrase_model(unigram_sentences, min_count=2))

# Corpus-wide list of sentences after applying the bigram model.
bigram_sentences = [bigram_model[sentence] for sentence in unigram_sentences]

# The same transformation, kept per case label.
bigram_sentence_dict = {
    label: list(bigram_model[sentences])
    for label, sentences in unigram_sentences_dict.items()
}

In [None]:
# Second phrase-detection pass: train on the bigram sentences so that a
# detected bigram can be joined with a neighbouring token.
trigram_model = Phraser(train_phrase_model(bigram_sentences, min_count=1))

# Corpus-wide list of sentences after applying the trigram model.
trigram_sentences = [trigram_model[sentence] for sentence in bigram_sentences]

# The same transformation, kept per case label.
trigram_sentence_dict = {
    label: list(trigram_model[sentences])
    for label, sentences in bigram_sentence_dict.items()
}

In [None]:
# Debug output of the per-case trigram sentences; change the condition to
# False to skip it.
if True:  # for debugging
    print_label_sent_dict(trigram_sentence_dict)

# Task 4
## Create data frame of features

In [None]:
import itertools
from robot_judge.nlp.ngrams import create_df_from_label_sent_dict, get_most_common_words
from robot_judge.nlp.ngrams import get_labels_without_year
from robot_judge.nlp.ngrams import get_target_values

# Index is case label, columns are words, entries are counts
feat_df = create_df_from_label_sent_dict(trigram_sentence_dict)
# Derive the target value for every case in the frame from its label.
target_labels = get_labels_without_year(feat_df)
target_values = get_target_values(target_labels)
# The target becomes the first column of the feature frame.
feat_df.insert(0, '__case_reversed__', target_values)
# NOTE(review): fillna runs after the target column was inserted, so a missing
# target value would also be turned into 0.0 — confirm that get_target_values
# never yields NaN.
feat_df.fillna(0.0, inplace=True)

# Word list used to restrict the feature columns in the next cell.
most_common_words = get_most_common_words(trigram_sentence_dict, 1000)

In [None]:
# Restrict the frame to the most common words plus the target column
# (the target ends up as the last column).
selected_columns = most_common_words + ['__case_reversed__']
feat_df = feat_df[selected_columns]

# Feature matrix: every column except the target.
X = feat_df.drop('__case_reversed__', axis=1)
# Target vector.
y = feat_df['__case_reversed__']

X.head()

# Task 5
## Create training / test set

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

In [None]:
# Hold out 30% of the cases as test data; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Standardize the features, then classify with logistic regression.
steps = [
    ('std_scaler', StandardScaler()),
    ('log_regr', LogisticRegression()),
]
pipeline = Pipeline(steps)

# Candidate values for the C parameter of the 'log_regr' step.
param_grid = {'log_regr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# 2-fold grid search over C; with refit=True the best configuration is
# retrained on the full training set, so clf can predict directly.
clf = GridSearchCV(pipeline, param_grid=param_grid, cv=2, refit=True).fit(X_train, y_train)

In [None]:
# Predictions of the fitted grid-search model on the held-out test data.
y_pred = clf.predict(X_test)

In [None]:
# Text summary of the classification metrics on the held-out test data.
print('Classification report:\n')
print(classification_report(y_test, y_pred))

# Task 6
## GridSearchCV and ROC / AUC evaluation

In [None]:
from scikitplot.metrics import plot_confusion_matrix
from scikitplot.metrics import plot_roc
from robot_judge.ml import transform_to_text_label

In [None]:
# Convert the true and predicted targets to readable class names for plotting.
# NOTE(review): assumes transform_to_text_label maps the negative class to the
# first name and the positive class to the second — confirm in robot_judge.ml.
y_true_labels = transform_to_text_label(y_test, 'not_reversed', 'reversed')
y_pred_labels = transform_to_text_label(y_pred, 'not_reversed', 'reversed')

# The return value (the plot axes) is assigned to _ to keep it out of the cell output.
_ = plot_confusion_matrix(y_true_labels, y_pred_labels, figsize=(8, 8), 
                          title="Confusion Matrix for held out test data.")

In [None]:
# Predicted class probabilities for the held-out test data.
y_proba = clf.predict_proba(X_test)
# Per-class ROC curves only; the macro and micro averages are switched off.
# NOTE(review): the probability columns must line up with the text labels —
# confirm against clf.classes_.
_ = plot_roc(y_true_labels, y_proba, title='ROC Curves for held out test data', figsize=(8, 8), 
             plot_macro=False, plot_micro=False)
print('For AUC score, see plot.')

# Task 7 
## Vader compound sentiment scores

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
%%time 

def label_case_dict_to_sentence_list(label_case_dict):
    """Split every case text into sentences and return them as one flat list.

    Args:
        label_case_dict: Dict whose values are the full case texts. The keys
            are not used.

    Returns:
        List of sentence strings, in the iteration order of the dict values
        and, within a case, in document order. Empty for an empty dict.
    """
    all_sentences = []
    # Iterate over the argument rather than the notebook-global sampled_cases,
    # so the function works for whichever dict the caller passes in.
    for case_text in label_case_dict.values():
        all_sentences.extend(sentence.text for sentence in spacy_nlp(case_text).sents)
    return all_sentences

# Sentence texts of all sampled cases; reused by the sentiment, tf-idf and
# clustering cells below.
all_sentences = label_case_dict_to_sentence_list(sampled_cases)

In [None]:
# Shared VADER analyzer instance; calculate_sentiment_scores reads it as a global.
sid = SentimentIntensityAnalyzer()

def calculate_sentiment_scores(sentences):
    """Score each sentence with the module-level analyzer ``sid``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        List with one dict per sentence, holding every entry returned by
        ``sid.polarity_scores`` plus the sentence itself under ``'text'``.
    """
    return [
        {**sid.polarity_scores(text), 'text': text}
        for text in sentences
    ]

# Only the first 1000 sentences are scored.
sentence_scores = calculate_sentiment_scores(all_sentences[:1000])
scores_df = pd.DataFrame(sentence_scores)

# NOTE(review): the task heading mentions the compound score, but the ranking
# below uses the separate 'pos' and 'neg' scores — confirm which is intended.
for column, heading in (('pos', 'Most POSITIVE sentences:'),
                        ('neg', 'Most NEGATIVE sentences:')):
    # Highest score first; show the top ten sentences.
    sent_df = scores_df.sort_values(by=column, ascending=False)
    print(heading)
    for idx, sent in enumerate(sent_df['text'][:10]):
        print('\t', idx, sent)

# Task 8
## tf-idf vectorizer and sentence similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from robot_judge.utils.data_structs import sort_coo_matrix

In [None]:
# Fit a tf-idf vectorizer with default settings on the first 100 sentences only.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(all_sentences[:100])
# CSR copy of the tf-idf matrix, used for the sparse cosine-similarity computation.
X_tfidf_sparse = sparse.csr_matrix(X_tfidf)

In [None]:
# Pairwise cosine similarities between the tf-idf sentence vectors. Only the
# lower triangle is kept, so each pair of sentences shows up once.
similarity_matrix = cosine_similarity(X_tfidf_sparse, dense_output=False)
lower_triangle = sparse.coo_matrix(sparse.tril(similarity_matrix))

# Drop self-pairs (same row and column index) and scores that are not below 1.0.
# NOTE: the list is not cut down to a top ten; every remaining pair is kept,
# in the order produced by sort_coo_matrix.
cos_sim = [
    sim
    for sim in sort_coo_matrix(lower_triangle)
    if sim[0] != sim[1] and sim[2] < 1.0
]

In [None]:
# Print sentence with high cos similarity

# For every remaining pair: the score, both sentences, then a separator line.
separator = 20 * '-'
for idx_1, idx_2, score in cos_sim:
    print('Cos sim score: {}\n'.format(score))
    print(all_sentences[idx_1], all_sentences[idx_2], sep='\n')
    print(separator)

# Task 9
## K-means clustering of sentences

In [None]:
import collections
from sklearn.cluster import KMeans

In [None]:
def cluster_texts(tfidf_model, texts, clusters=3):
    """Cluster texts with K-Means on freshly computed tf-idf features.

    Args:
        tfidf_model: Not used. Whatever is passed in is discarded; the tf-idf
            features are always recomputed from ``texts``.
        texts: Iterable of raw text strings.
        clusters: Number of K-Means clusters.

    Returns:
        A ``collections.defaultdict(list)`` mapping each cluster label to the
        positions (within ``texts``) of the texts assigned to that cluster.

    Note:
        No ``random_state`` is passed to K-Means, so the cluster assignment
        can differ between runs.
    """
    # Terms that occur in more than 50% or in fewer than 10% of the texts
    # are left out of the vocabulary.
    vectorizer = TfidfVectorizer(lowercase=True, min_df=0.1, max_df=0.5)
    features = vectorizer.fit_transform(texts)

    kmeans = KMeans(n_clusters=clusters)
    kmeans.fit(features)

    members_by_cluster = collections.defaultdict(list)
    for position, cluster_label in enumerate(kmeans.labels_):
        members_by_cluster[cluster_label].append(position)
    return members_by_cluster


# Group the first 100 sentences into 7 clusters. X_tfidf is passed as the first
# argument, but cluster_texts recomputes its own tf-idf features from the texts.
clusters = cluster_texts(X_tfidf, all_sentences[:100], 7)

def print_sampled_texts_from_clusters(clusters, sentences, n_sample_per_cluster):
    """Print a random sample of sentences for every cluster.

    Args:
        clusters: Mapping of cluster id -> list of indices into ``sentences``.
        sentences: Sequence of texts that the indices refer to.
        n_sample_per_cluster: Maximum number of sentences printed per cluster.
            A cluster with fewer members is printed in full, in random order.
    """
    for cluster_id, text_indices in clusters.items():
        print('Cluster number {}:'.format(cluster_id))

        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the sample size at the cluster size.
        sample_size = min(n_sample_per_cluster, len(text_indices))

        for idx in random.sample(text_indices, sample_size):
            print('\t', sentences[idx])
    
# Show 3 randomly chosen sentences from each cluster of the first 100 sentences.
print_sampled_texts_from_clusters(clusters, all_sentences[:100], 3)