In [None]:
import numpy as np
import pandas as pd
import datetime as dt

# Render floats with two decimal places whenever pandas displays them.
pd.options.display.float_format = '{:.2f}'.format

# The file has no header row (header=None), so the column names are supplied
# here; 'Date' is parsed as dates and becomes the index.
df = pd.read_csv(
    '../data/000015',
    index_col='Date',
    names=['Date', 'Open', 'Close', 'High', 'Low', 'Volume', 'Money', 'PE', 'PB'],
    parse_dates=True,
    header=None,
)

# Row-over-row change of Close, in percent (the first row has no predecessor
# and is therefore NaN).
df['Return'] = df['Close'].pct_change() * 100

# Calendar components of each row's date, taken from the DatetimeIndex
# accessors instead of looping over every timestamp in Python.
df['Year'] = df.index.year
df['Month'] = df.index.month
df['Day'] = df.index.day

# Summarizing and computing descriptive statistics

| Method           | Description                                                                                 |
| ---------------- | ------------------------------------------------------------------------------------------- |
| `count`          | Number of non-NA values                                                                     |
| `describe`       | Compute set of summary statistics for Series or each DataFrame column                       |
| `min, max`       | Compute minimum and maximum values                                                          |
| `argmin, argmax` | Compute index locations (integers) at which minimum or maximum value obtained, respectively |
| `idxmin, idxmax` | Compute index values at which minimum or maximum value obtained, respectively               |
| `quantile`       | Compute sample quantile ranging from 0 to 1                                                 |
| `sum`            | Sum of values                                                                               |
| `mean`           | Mean of values                                                                              |
| `median`         | Arithmetic median (50% quantile) of values                                                  |
| `mad`            | Mean absolute deviation from mean value                                                     |
| `var`            | Sample variance of values                                                                   |
| `std`            | Sample standard deviation of values                                                         |
| `skew`           | Sample skewness (3rd moment) of values                                                      |
| `kurt`           | Sample kurtosis (4th moment) of values                                                      |
| `cumsum`         | Cumulative sum of values                                                                    |
| `cummin, cummax` | Cumulative minimum or maximum of values, respectively                                       |
| `cumprod`        | Cumulative product of values                                                                |
| `diff`           | Compute 1st arithmetic difference (useful for time series)                                  |
| `pct_change`     | Compute percent changes                                                                     |

## Quartile

In [None]:
from scipy.stats.mstats import mquantiles

# Ascending copy of the closing prices, kept as a notebook variable.
sorted_close = np.sort(df['Close'])

# A notebook cell only echoes its final expression, so this intermediate
# result has to be printed explicitly or it is silently discarded.
# With no `prob` argument mquantiles reports the 25%, 50% and 75% quantiles.
print(mquantiles(sorted_close))

# Minimum, the three quartiles and the maximum in one vectorized call.
# NOTE(review): mquantiles and np.percentile use different default
# interpolation rules, so the quartiles above may differ slightly from these.
np.percentile(sorted_close, [0, 25, 50, 75, 100])

## Correlation and covariance

## Unique values, value counts, and membership

# Hypothesis testing - the null and alternative hypotheses

## The null and alternative hypotheses

### The alpha and p-values

### Type I and Type II errors

## Statistical hypothesis tests

### The z-test

### The t-test

## Confidence intervals

## Correlation and linear regression

### Correlation

### Linear regression

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
%matplotlib inline

# Render floats with two decimal places whenever pandas displays them.
pd.options.display.float_format = '{:.2f}'.format

# Column layout shared by both data files (they have no header row).
_CSV_COLUMNS = ['Date', 'Open', 'Close', 'High', 'Low', 'Volume', 'Money', 'PE', 'PB']


def _load_recent(path, rows=240):
    """Load one data file and return its last *rows* rows with a 'Return' column.

    'Return' is the row-over-row percentage change of 'Close', computed after
    trimming, so the first returned row is NaN.  *rows* must be positive.
    """
    frame = pd.read_csv(path, index_col='Date', names=_CSV_COLUMNS,
                        parse_dates=True, header=None)
    # Take an explicit copy: adding a column to a slice of another frame is
    # chained assignment and can raise SettingWithCopyWarning.
    recent = frame.iloc[-rows:].copy()
    recent['Return'] = recent['Close'].pct_change() * 100
    return recent


df1 = _load_recent('../data/000015')
df2 = _load_recent('../data/000300')

# Pair the two return series by date and drop incomplete rows (including the
# leading NaN of each series).  The plot, the polyfit line and the OLS model
# all read from this one frame, so they are guaranteed to use the same
# observations even if the two files do not cover identical dates.
df3 = pd.concat([df1['Return'], df2['Return']], axis=1, join='inner').dropna()
df3.columns = ['Return1', 'Return2']

plt.scatter(df3['Return1'], df3['Return2'], marker='o', edgecolor='b', facecolor='none', alpha=0.5)
plt.xlabel('000015')
plt.ylabel('000300')

# Least-squares line of Return2 (y axis) on Return1 (x axis).
slope, intercept = np.polyfit(df3['Return1'], df3['Return2'], 1)
plt.plot(df3['Return1'], df3['Return1'] * slope + intercept, 'r')
plt.show()

# NOTE(review): this formula regresses Return1 on Return2, the opposite
# direction of the red line drawn above, so the coefficients in the summary
# are not the plotted slope/intercept -- confirm which direction is intended.
results = sm.ols('Return1 ~ Return2', df3).fit()
results.summary()
