# TODO
- Analyze the words with large weights in ols, lasso and ridge
- Analyze the important words from RF
- Analyze the words for observations that were misclassified under "Examine Misclassified Words for Positive Tweets" and "Examine Misclassified Words for Negative Tweets"
    - I created `mis_pos_counts_dict` and `mis_neg_counts_dict` as starting points (will need to convert into a list or data frame depending on what analysis is being done)
    - Maybe make a word cloud? Although we already have other word clouds so maybe something else? Not sure what are good visualizations for words so maybe google that?
    - Also on that note, if you find a visualization that would make more sense to do instead of a word cloud for any of the word clouds in DataExploration.ipynb, please feel free to do that :) 
- Do similar analysis as above with the correctly classified stuff
- Add more to the proportion stuff for both above since I just made the dataframe but then didn't do much after that
- Try to do something with `lsvc_test`? IDK what to do 
- Anything else you can think of :) 

# Analyze Machine Learning Results

In [1]:
# global imports
import pyspark
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as W

# reuse the active Spark session if there is one, otherwise start a new one
spark = SparkSession.builder.getOrCreate()

# Import Data

In [2]:
# import data
# restore the variable `ml_dfs` from IPython's %store database
# (presumably saved by the model-fitting notebook — confirm)
%store -r ml_dfs

In [3]:
# first entry of ml_dfs: linear support vector machine results
# (count vector, label, raw prediction and predicted label)
lsvc_test = spark.createDataFrame(ml_dfs[0])
# preview the first two rows
lsvc_test.show(n=2)

+--------------------+---------+--------------------+----------+
|           count_vec|label_idx|       rawPrediction|prediction|
+--------------------+---------+--------------------+----------+
|(349,[2,58,118,12...|      0.0|[0.76933931432304...|       0.0|
|(349,[0,4,134,186...|      1.0|[-0.4556061437591...|       1.0|
+--------------------+---------+--------------------+----------+
only showing top 2 rows



In [4]:
# second entry of ml_dfs: one prediction column per model next to the label
results = spark.createDataFrame(ml_dfs[1])
# preview the first two rows
results.show(n=2)

+--------------------+---------+-------+---+--------+-------+--------+----------+----------+---------+
|           count_vec|label_idx|rf_pred| id|gbt_pred|nb_pred|ols_pred|lasso_pred|ridge_pred|lsvc_pred|
+--------------------+---------+-------+---+--------+-------+--------+----------+----------+---------+
|(349,[2,58,118,12...|      0.0|    0.0|  1|     0.0|    0.0|     0.0|       0.0|       0.0|      0.0|
|(349,[0,4,134,186...|      1.0|    0.0|  2|     1.0|    1.0|     1.0|       1.0|       1.0|      1.0|
+--------------------+---------+-------+---+--------+-------+--------+----------+----------+---------+
only showing top 2 rows



In [5]:
# import data
# restore the variable `dfs` from IPython's %store database
# (presumably saved by an earlier notebook — confirm)
%store -r dfs

In [6]:
# import vocabulary
# NOTE(review): the renaming cells further down use vocab[i] as the word for
# index i of count_vec — presumably this is the count-vectorizer vocabulary; confirm
vocab = dfs[3]

# Misclassified Observations by Every Algorithm

In [7]:
# keep only the observations whose label differs from the prediction of
# every one of the seven models
pred_cols = ['rf_pred', 'gbt_pred', 'nb_pred', 'ols_pred',
             'lasso_pred', 'ridge_pred', 'lsvc_pred']
misclassified = results
for pred_col in pred_cols:
    # successive filters are ANDed together
    misclassified = misclassified.where(W.col('label_idx') != W.col(pred_col))
# preview the first two rows
misclassified.show(2)

+--------------------+---------+-------+---+--------+-------+--------+----------+----------+---------+
|           count_vec|label_idx|rf_pred| id|gbt_pred|nb_pred|ols_pred|lasso_pred|ridge_pred|lsvc_pred|
+--------------------+---------+-------+---+--------+-------+--------+----------+----------+---------+
|(349,[1,110,280,3...|      1.0|    0.0|  3|     0.0|    0.0|     0.0|       0.0|       0.0|      0.0|
|(349,[2,11,13,15,...|      1.0|    0.0| 20|     0.0|    0.0|     0.0|       0.0|       0.0|      0.0|
+--------------------+---------+-------+---+--------+-------+--------+----------+----------+---------+
only showing top 2 rows



## Examine Misclassified Labels

In [8]:
# number of observations in `results` for each label
total_counts = results.groupBy('label_idx').count()
total_counts.show()

+---------+-----+
|label_idx|count|
+---------+-----+
|      0.0| 2779|
|      1.0| 1593|
+---------+-----+



In [9]:
# per-label count of observations misclassified by every model ...
mis_counts = (
    misclassified.groupBy('label_idx').count()
    # rename both columns so nothing clashes with total_counts in the join
    .withColumnRenamed('label_idx', 'label_idx2')
    .withColumnRenamed('count', 'miscount')
)
# ... joined to the per-label totals and expressed as a proportion of them
mis_counts = (
    mis_counts.join(total_counts, total_counts.label_idx == mis_counts.label_idx2)
    .drop('label_idx2')
    .withColumn('proportion', W.col('miscount') / W.col('count'))
)
mis_counts.show()

+--------+---------+-----+--------------------+
|miscount|label_idx|count|          proportion|
+--------+---------+-----+--------------------+
|      19|      0.0| 2779|0.006836991723641598|
|     349|      1.0| 1593| 0.21908349026993096|
+--------+---------+-----+--------------------+



## Examine Misclassified Words for Positive Tweets

In [10]:
# function for creating key and value pair of a row
def key_val(row):
    """Pair every entry of ``row`` with its position.

    Args:
        row: iterable of values (the notebook passes a dense count array).

    Returns:
        list of ``(index, value)`` tuples in iteration order; an empty list
        when ``row`` is empty.
    """
    # enumerate replaces the manual range(len(...)) indexing and also accepts
    # iterables that do not support len() or subscripting
    return list(enumerate(row))

In [11]:
# sum the count vectors of the label-1 observations that every model got
# wrong and collect the totals on the driver as {vector index: total count}
mis_pos_counts_dict = (
    misclassified
    .where(W.col('label_idx') == 1)
    .select('count_vec')
    .rdd
    # densify each vector and emit one (index, count) pair per entry
    .flatMap(lambda rec: key_val(rec.count_vec.toArray()))
    .reduceByKey(lambda left, right: left + right)
    # totals come back as floats; store them as plain ints
    .mapValues(int)
    .collectAsMap()
)

In [12]:
# re-key the dictionary in place: integer position -> vocabulary word
for position in range(len(vocab)):
    total = mis_pos_counts_dict.pop(position)
    mis_pos_counts_dict[vocab[position]] = total

## Examine Misclassified Words for Negative Tweets

In [13]:
# sum the count vectors of the label-0 observations that every model got
# wrong and collect the totals on the driver as {vector index: total count}
mis_neg_counts_dict = (
    misclassified
    .where(W.col('label_idx') == 0)
    .select('count_vec')
    .rdd
    # densify each vector and emit one (index, count) pair per entry
    .flatMap(lambda rec: key_val(rec.count_vec.toArray()))
    .reduceByKey(lambda left, right: left + right)
    # totals come back as floats; store them as plain ints
    .mapValues(int)
    .collectAsMap()
)

In [14]:
# re-key the dictionary in place: integer position -> vocabulary word
for position in range(len(vocab)):
    total = mis_neg_counts_dict.pop(position)
    mis_neg_counts_dict[vocab[position]] = total

# Correctly Classified Observations by Every Algorithm

In [15]:
# keep only the observations whose label matches the prediction of
# every one of the seven models
pred_cols = ['rf_pred', 'gbt_pred', 'nb_pred', 'ols_pred',
             'lasso_pred', 'ridge_pred', 'lsvc_pred']
correct = results
for pred_col in pred_cols:
    # successive filters are ANDed together
    correct = correct.where(W.col('label_idx') == W.col(pred_col))
# preview the first two rows
correct.show(2)

+--------------------+---------+-------+---+--------+-------+--------+----------+----------+---------+
|           count_vec|label_idx|rf_pred| id|gbt_pred|nb_pred|ols_pred|lasso_pred|ridge_pred|lsvc_pred|
+--------------------+---------+-------+---+--------+-------+--------+----------+----------+---------+
|(349,[2,58,118,12...|      0.0|    0.0|  1|     0.0|    0.0|     0.0|       0.0|       0.0|      0.0|
|(349,[1,6,13,18,2...|      0.0|    0.0|  4|     0.0|    0.0|     0.0|       0.0|       0.0|      0.0|
+--------------------+---------+-------+---+--------+-------+--------+----------+----------+---------+
only showing top 2 rows



## Examine Correctly Classified Labels

In [16]:
# per-label count of observations classified correctly by every model ...
# NOTE: the count column is still named 'miscount' even though it holds
# the number of correctly classified rows
correct_counts = (
    correct.groupBy('label_idx').count()
    # rename both columns so nothing clashes with total_counts in the join
    .withColumnRenamed('label_idx', 'label_idx2')
    .withColumnRenamed('count', 'miscount')
)
# ... joined to the per-label totals and expressed as a proportion of them
correct_counts = (
    correct_counts.join(total_counts, total_counts.label_idx == correct_counts.label_idx2)
    .drop('label_idx2')
    .withColumn('proportion', W.col('miscount') / W.col('count'))
)
correct_counts.show()

+--------+---------+-----+-------------------+
|miscount|label_idx|count|         proportion|
+--------+---------+-----+-------------------+
|    2130|      0.0| 2779| 0.7664627563871896|
|     214|      1.0| 1593|0.13433772755806653|
+--------+---------+-----+-------------------+



## Examine Correctly Classified Words for Positive Tweets

In [17]:
# sum the count vectors of the label-1 observations that every model got
# right and collect the totals on the driver as {vector index: total count}
correct_pos_counts_dict = (
    correct
    .where(W.col('label_idx') == 1)
    .select('count_vec')
    .rdd
    # densify each vector and emit one (index, count) pair per entry
    .flatMap(lambda rec: key_val(rec.count_vec.toArray()))
    .reduceByKey(lambda left, right: left + right)
    # totals come back as floats; store them as plain ints
    .mapValues(int)
    .collectAsMap()
)

In [18]:
# re-key the dictionary in place: integer position -> vocabulary word
for position in range(len(vocab)):
    total = correct_pos_counts_dict.pop(position)
    correct_pos_counts_dict[vocab[position]] = total

## Examine Correctly Classified Words for Negative Tweets

In [19]:
# sum the count vectors of the label-0 observations that every model got
# right and collect the totals on the driver as {vector index: total count}
correct_neg_counts_dict = (
    correct
    .where(W.col('label_idx') == 0)
    .select('count_vec')
    .rdd
    # densify each vector and emit one (index, count) pair per entry
    .flatMap(lambda rec: key_val(rec.count_vec.toArray()))
    .reduceByKey(lambda left, right: left + right)
    # totals come back as floats; store them as plain ints
    .mapValues(int)
    .collectAsMap()
)

In [20]:
# re-key the dictionary in place: integer position -> vocabulary word
for position in range(len(vocab)):
    total = correct_neg_counts_dict.pop(position)
    correct_neg_counts_dict[vocab[position]] = total