# Create SparkSession, import packages

# To Do:
* combine the data with the features file, to get a header
* do variable selection and transformation
* spit out a final `.csv` to use for modeling

# Ideas
* look at the data via kmeans with 6 clusters?
* compute the AUC for each variable (after fitting a regression on that single variable) to see which ones are important?
* create variables that are combinations of other variables
* look for outliers and then run pca before clustering?
* run multinomial regression with elastic net? (with k-fold cv)
* or maybe try neural nets?
* something time series?


In [95]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, asc
import pyspark.sql.types as typ
import pyspark.sql.functions as F
import os
import pandas as pd
# Build (or reuse) a Spark session running against a local master with the
# resource settings below.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("xor")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

# Handles used by the rest of the notebook: the underlying SparkContext and a
# SQLContext wrapping it (used for the pandas -> Spark DataFrame conversions).
sc = spark.sparkContext
sqlCtx = SQLContext(sc)

In [65]:
# Header: read the feature-name file (whitespace-delimited, no header row).
# The separator is a raw string so that `\s` is a regex class, not an invalid
# string escape.
pandas_header = pd.read_csv("Data/features.txt", sep=r'\s+', header=None)
# This has an index column (column 0), so we deselect it. `columns=` replaces
# the positional axis argument of `drop(0, 1)`, which pandas 2.0 rejects.
pandas_header = pandas_header.drop(columns=0)
# Conversion to Spark df.
df_header = sqlCtx.createDataFrame(pandas_header)
# There are 561 entries, corresponding to all the variables.

In [74]:
# Read in with pandas first since the data is weirdly delimited: values are
# separated by runs of whitespace, hence the regex separator (a raw string, so
# `\s` is not parsed as an invalid string escape).
pandas_train_features = pd.read_csv("Data/X_train.txt", sep=r'\s+', header=None)
# TODO: attach the feature names as column names (earlier attempt, disabled):
#   pandas_train_features.columns = pandas_header
# Conversion to Spark df.
df_train_features = sqlCtx.createDataFrame(pandas_train_features)

In [3]:
# Training labels: whitespace-delimited file with no header row. The separator
# is a raw string so that `\s` is not parsed as an invalid string escape.
pandas_train_labels = pd.read_csv("Data/y_train.txt", sep=r'\s+', header=None)
# Conversion to Spark df.
df_train_labels = sqlCtx.createDataFrame(pandas_train_labels)

In [4]:
# Test features: whitespace-delimited file with no header row. The separator
# is a raw string so that `\s` is not parsed as an invalid string escape.
pandas_test_features = pd.read_csv("Data/X_test.txt", sep=r'\s+', header=None)
# Conversion to Spark df.
df_test_features = sqlCtx.createDataFrame(pandas_test_features)

In [5]:
# Test labels: whitespace-delimited file with no header row. The separator is
# a raw string so that `\s` is not parsed as an invalid string escape.
pandas_test_labels = pd.read_csv("Data/y_test.txt", sep=r'\s+', header=None)
# Conversion to Spark df.
df_test_labels = sqlCtx.createDataFrame(pandas_test_labels)

In [109]:
# Read the subject number for every train/test row. Each file is read as
# whitespace-delimited with no header row, and its single column is named 'id'.
# The separator is a raw string so that `\s` is not parsed as an invalid
# string escape.
panda_sub_train = pd.read_csv("Data/subject_train.txt", sep=r'\s+', header=None)
panda_sub_train.columns = ['id']
panda_sub_test = pd.read_csv("Data/subject_test.txt", sep=r'\s+', header=None)
panda_sub_test.columns = ['id']
# Conversion to Spark df.
df_sub_train = sqlCtx.createDataFrame(panda_sub_train)
df_sub_test = sqlCtx.createDataFrame(panda_sub_test)

In [135]:
# Attach the subject id to the feature rows by joining the two pandas frames
# on their row index (train and test handled separately).
merged_train = panda_sub_train.merge(
    pandas_train_features, left_index=True, right_index=True
)
merged_test = panda_sub_test.merge(
    pandas_test_features, left_index=True, right_index=True
)
# Spark copies of the merged frames.
df_merged_train_features = sqlCtx.createDataFrame(merged_train)
# NOTE(review): "ttest" looks like a typo for "test"; the name is kept as-is
# in case other cells refer to it.
df_merged_ttest_features = sqlCtx.createDataFrame(merged_test)

In [139]:
# Check for missing values: count, per column, the rows that are NaN *or*
# null. `isnan` alone does not match nulls, so both tests are combined.
from pyspark.sql.functions import isnan, when, count, col
df_merged_train_features.select([
    # `when` yields the literal column name for a missing row and null
    # otherwise, so `count` (which skips nulls) tallies the missing rows.
    # The value must stay a literal: using the column itself would give null
    # for null rows and they would not be counted.
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in df_merged_train_features.columns
]).show()


+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---

In [143]:
# Snippet of a shorter missing values count (NaN or null), limited to the
# subject id and the first five feature columns. `isnan` alone does not match
# nulls, so both tests are combined.
df_merged_train_features.select([
    # The `when` value is the literal column name, so missing rows produce a
    # non-null value that `count` can tally.
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in ['id', '0', '1', '2', '3', '4']
]).show()

+---+---+---+---+---+---+
| id|  0|  1|  2|  3|  4|
+---+---+---+---+---+---+
|  0|  0|  0|  0|  0|  0|
+---+---+---+---+---+---+



In [145]:
# Summary statistics (count, mean, stddev, min, max) for the first five
# feature columns.
(
    df_merged_train_features
    .select(['0', '1', '2', '3', '4'])
    .describe()
    .show()
)

+-------+-------------------+--------------------+--------------------+-------------------+-------------------+
|summary|                  0|                   1|                   2|                  3|                  4|
+-------+-------------------+--------------------+--------------------+-------------------+-------------------+
|  count|               7352|                7352|                7352|               7352|               7352|
|   mean| 0.2744881249635204|-0.01769542735779...|-0.10914102015124041|-0.6054384511368359|-0.5109375609608321|
| stddev|0.07026133266614942|0.040810524573263465|  0.0566351880847293| 0.4487343844696271| 0.5026446547401445|
|    min|               -1.0|                -1.0|                -1.0|               -1.0|        -0.99987292|
|    max|                1.0|                 1.0|                 1.0|                1.0|         0.91623796|
+-------+-------------------+--------------------+--------------------+-------------------+-------------------+

# Look at class imbalance in our data