# Python Data Statistics with pandas
- Outside of Python this is rolling backup

## Set Up Backend 

In [None]:
# This must be done before any other imports for matplotlib
# This selects the matplotlib backend that IPython uses for rendering (inline, notebook, or qt)

#%matplotlib inline
%matplotlib notebook
#%matplotlib qt

In [None]:
# The rest of the imports

from pandas import DataFrame
from pandas import Series
from numpy.random import randint
from numpy.random import randn
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import numpy as np
import matplotlib as mpl

## Data Preparation

### Get Raw Data

In [None]:
# Ticker symbol used to build the name of the CSV file to load.
stock = 'GOOGL'
# Read the raw data for that ticker into a DataFrame.
df = pd.read_csv(f'./Source_files/{stock}_stock.csv')
# Preview the first ten rows, then the non-null count of every column.
print(df.head(n=10), df.count(), sep='\n')

### Convert Index to Timestamp and Drop `date` Column

In [None]:
# Parse the 'date' strings into timestamps. `rename` maps every existing row
# label through this Series, so it relies on `df` still carrying the positional
# index that `read_csv` produced. The text column is dropped afterwards.
parsed_dates = pd.to_datetime(df['date'], format='%Y-%m-%d')
df_date_index = df.rename(index=parsed_dates).drop(columns='date')
print(df_date_index.head())
print(type(df_date_index.index))

### Convert to Date Period with Period Being One Day
- Assume that the mean of the values recorded within one day is the value for that day
- Data is not collected on weekends, so those days would show up as `np.nan`; forward fill (`.ffill()`, equivalent to `.fillna(method='ffill')`) fills Saturday and Sunday with Friday's value

In [None]:
# Resample to one row per calendar day (mean of any same-day rows), convert the
# index to daily periods, and forward-fill the days that had no data.
# `.to_period('D')` and `.ffill()` are used instead of `resample(kind='period')`
# and `fillna(method='ffill')`, both of which are deprecated in pandas 2.x.
df_day_all_days = df_date_index.resample('D').mean().to_period('D').ffill()

print(df_day_all_days.head())
print(df_day_all_days.count())
print(type(df_day_all_days.index))

### Creating a Function

In [None]:
def convert_to_day_period(source_file, date_column_name, time_period):
    """Load a CSV and return per-period column means on a gap-free PeriodIndex.

    Despite the name, the period length is whatever ``time_period`` requests.

    Parameters
    ----------
    source_file : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.
    date_column_name : str
        Name of the column holding dates formatted as ``YYYY-MM-DD``.
    time_period : str
        pandas period frequency alias, e.g. ``'D'`` (daily) or ``'Y'`` (yearly).

    Returns
    -------
    pandas.DataFrame
        One row per period between the first and last date in the file. Each
        remaining column holds the mean of the rows falling in that period;
        periods without any rows are forward-filled from the previous period.
        The date column itself is not part of the result.
    """
    import pandas as pd

    raw = pd.read_csv(source_file)

    # Map every row to the period that contains its date.
    timestamps = pd.to_datetime(raw[date_column_name], format='%Y-%m-%d')
    periods = timestamps.dt.to_period(time_period)

    # Grouping on periods replaces `resample(..., kind='period')`, whose `kind`
    # argument is deprecated, and keeps period aliases such as 'Y' usable.
    per_period = raw.drop(columns=date_column_name).groupby(periods).mean()
    if per_period.empty:
        # No rows means there is no first/last period to span.
        return per_period

    # Re-create the periods that had no rows (e.g. weekends) so they can be
    # forward-filled; `.ffill()` replaces the deprecated `fillna(method=...)`.
    every_period = pd.period_range(start=per_period.index.min(),
                                   end=per_period.index.max())
    return per_period.reindex(every_period).ffill()

#### Testing Function

In [None]:
# Exercise the helper with a yearly period.
# NOTE(review): the raw-data cell builds the path './Source_files/GOOGL_stock.csv'
# while this call reads 'GOOGLE_stock.csv' -- confirm which file name is intended.
df_google_year = convert_to_day_period(
    source_file='./Source_files/GOOGLE_stock.csv',
    date_column_name='date',
    time_period='Y',
)

In [None]:
# Show the first yearly rows, then the non-null count of every column.
print(df_google_year.head(), df_google_year.count(), sep='\n')

## Percent Change
- Compute the percent change over a given number of periods

In [None]:
# Percent change of every column relative to its value 30 rows (days) earlier.
df_percent_change = df_day_all_days.pct_change(periods=30)
print(df_percent_change.count())
# Rows 25-34 straddle position 30, the first row with a value 30 periods back.
print(df_percent_change.iloc[25:35])

### Plotting the Data

In [None]:
# Line plot of the 30-day percent change of the 'volume' column; the returned
# Axes is kept so the axis labels can be set on it.
ax = df_percent_change['volume'].plot.line(
    figsize=(8, 4),
    title='Percent_change with period of 30 days',
    grid=True,
    legend=True,
)
ax.set_xlabel('Date')
ax.set_ylabel('Percent Change')

### Saving the Graphic

In [None]:
# Save through the figure that owns `ax` rather than pyplot's implicit
# "current figure": with the inline backend each cell's figure is closed when
# the cell finishes, so `plt.savefig` in a separate cell would write a blank
# image. Going through `ax` saves the plotted figure under every backend.
ax.get_figure().savefig('Destination_files/Volume_30_percent_change.png')

## Correlation
- Default is Pearson; Kendall and Spearman are also available

### Between Series in Same DataFrame

In [None]:
# Pearson correlation between the 'high' and 'low' columns.
df_day_all_days['high'].corr(other=df_day_all_days['low'], method='pearson')

In [None]:
# Spearman (rank) correlation between the 'high' and 'low' columns.
df_day_all_days['high'].corr(other=df_day_all_days['low'], method='spearman')

In [None]:
# Kendall tau correlation between the 'high' and 'low' columns.
df_day_all_days['high'].corr(other=df_day_all_days['low'], method='kendall')

### Pairwise Correlation in DataFrame

In [None]:
# Pearson correlation matrix across every pair of columns in the frame.
df_day_all_days.corr(method='pearson')

## Correlation Between Different DataFrames

In [None]:
# Build the daily NKE table with the same helper, then remove its
# 'adj_close' column.
df_NKE_day = convert_to_day_period(
    source_file='./Source_files/NKE_stock.csv',
    date_column_name='date',
    time_period='D',
)
df_NKE_day = df_NKE_day.drop(columns=['adj_close'])

In [None]:
# Preview the first five rows of the NKE table.
print(df_NKE_day.head(n=5))

In [None]:
# Column-by-column Pearson correlation between the two frames; rows are
# matched on their index labels.
df_day_all_days.corrwith(other=df_NKE_day, axis='index', method='pearson')

## Other Functions

- Data ranking
- Covariance
  - Be sure you understand the math for pandas version of Covariance
- Apply
  - Allows you to change a column or DataFrame elements in arbitrary ways

# End of Notebook