# In class exercises - Intro to Pandas Series and DataFrames

## Import libs

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pandas import DataFrame, read_csv


## First import 'response_time_data.csv' data file
* Contains RTs from 800 trials of a simple detection task from each of 20 subjects
* The data were organized into a DataFrame and then saved out in CSV format
* The index (row) and column labels are encoded in the CSV file, so you'll need to read those in explicitly
* Make sure to have a look at the DataFrame - use the df.head() function

In [None]:
# Load the response-time table. The first CSV column supplies the row
# labels (index_col=0) and the first CSV row supplies the column labels
# (header=0).
rt_csv_path = "response_time_data.csv"
df = pd.read_csv(rt_csv_path, header=0, index_col=0)

# Peek at the first few rows to confirm the labels were parsed as intended.
df.head()

## Now have a look at the data using built-in Pandas functionality
* Check out the max/min of each column (subject), standard deviation, percentiles, etc.

In [None]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
summary_stats = df.describe()
summary_stats

## Are there missing values (NaNs) in the data?

In [None]:
# df.isna() is an element-wise boolean mask. The first .any() collapses it
# to one flag per column and the second .any() collapses those flags to a
# single bool. (Testing the uncalled method `.any` would always be truthy.)
has_missing = bool(df.isna().any().any())

if has_missing:
    print("Yes")
else:
    print("No")


## What about outliers? Let's define outliers here as values more than 2 * std away from the mean for each subject
* After you've found the outliers for each subject, replace those values with a np.nan (NaN)

In [None]:
# Flag outliers separately for each column: a value is an outlier when it
# lies more than 2 standard deviations from that column's mean, in EITHER
# direction (above or below).
subject_mean = df.mean(axis=0)
subject_std = df.std(axis=0)
outlier_cutoff = 2 * subject_std

# Per-column thresholds, printed for inspection. `outlier_min` keeps its
# original meaning: the smallest value that counts as a high-side outlier.
outlier_min = subject_mean + outlier_cutoff
outlier_low = subject_mean - outlier_cutoff
print(outlier_min)
print(outlier_low)

# DataFrame-vs-Series arithmetic and comparison align on column labels, so
# every column is compared against its own mean and standard deviation.
outliers = (df - subject_mean).abs() > outlier_cutoff

# Overwrite the flagged values with NaN in place.
df[outliers] = np.nan

display(df)

## After you've found the outliers and replaced with NaNs for each subject, check out this function:
[pandas.DataFrame.interpolate](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.interpolate.html#pandas.DataFrame.interpolate)

* Use this function to interpolate the missing values for each subject (do not interpolate across subjects!)
* Just use linear interpolation...

In [None]:
# Linear interpolation down each column (axis=0), so values are never
# interpolated across columns. These are the library defaults, spelled out.
# The result is not assigned, so `df` itself still contains its NaNs.
df.interpolate(method="linear", axis=0)

## You can explore the "Missing Values" page for Pandas to figure out other ways of filling in missing values and outliers

[page is here](https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data)

* Figure out how to replace the outliers with the mean of each subject

In [None]:
# Column-wise means; NaNs are skipped by default when averaging.
column_means = df.mean(axis=0)

# Replace each NaN with the mean of its own column, modifying df in place.
df.fillna(column_means, inplace=True)
print(df)

## Use the `pandas.DataFrame.sample` function to generate bootstrapped confidence intervals for the data from subject 11

[see this page for Samples](https://pandas.pydata.org/pandas-docs/version/0.22/generated/pandas.DataFrame.sample.html)


* Resample Sub11's data with replacement, each time pulling N samples (800 in this case)
* Generate a distribution of means across all resamples
* Compute 95% confidence intervals using:

[this page for quantiles](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.quantile.html)

In [None]:
# Bootstrap settings: each resample draws as many values as df has rows.
num_bootstraps = 5000
sample_num = len(df)

s11 = df['Sub11']

# Build the distribution of resampled means: draw with replacement, take
# the mean, repeat num_bootstraps times.
foo = []
for boot_iter in range(num_bootstraps):
    resampled = s11.sample(n=sample_num, replace=True)
    foo.append(resampled.mean())

plt.hist(foo)

In [None]:
# 95% confidence interval: the 2.5th and 97.5th percentiles of the
# bootstrapped means.
ci_percentiles = [2.5, 97.5]
CIs = np.percentile(foo, ci_percentiles)
for ci_bound in CIs:
    print(ci_bound)

# Histogram of the bootstrapped means with a vertical line at each CI bound.
plt.hist(foo, bins=30, color='r', alpha=1)
for ci_bound in CIs:
    plt.axvline(ci_bound, color='g', linewidth=1)
plt.xlabel('Sample mean')
plt.ylabel('Count')
plt.show()