In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np # importa NumPy com alias np
import pandas as pd # importa pandas com alias pd
import matplotlib.pyplot as plt # importa modulo matplotlib.pyplot com alias pd

# Importing cufflinks and plotly for interactive visualization
import cufflinks as cf
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from my_utils import utils
from sklearn.linear_model import LinearRegression

In [3]:
# lib versions
print(np.__name__, ":", np.__version__)
print(pd.__name__, ":", pd.__version__)
print(cf.__name__, ":", cf.__version__)
print(plotly.__name__, ":", plotly.__version__)


numpy : 1.16.3
pandas : 0.24.2
cufflinks : 0.15
plotly : 3.8.1


In [4]:
# Plotly configs
init_notebook_mode(connected=True)
cf.go_offline()

In [133]:
df = pd.read_csv("dataset/TrainExer13.txt", sep="\t")

In [137]:
df

Unnamed: 0,Game,Year,Winning time men
0,1,1948,10.3
1,2,1952,10.4
2,3,1956,10.5
3,4,1960,10.2
4,5,1964,10.0
5,6,1968,9.95
6,7,1972,10.14
7,8,1976,10.06
8,9,1980,10.25
9,10,1984,9.99


## Scatter Plot of Game x Winning Time

In [138]:
df.iplot(kind='scatter', mode='markers', x='Game', y='Winning time men', filename='cufflinks/Game-Winning Time')

## Simple Linear Regression
$W_i = \alpha + \beta G_i + \epsilon_i$ 

In [139]:
y = df["Winning time men"]
x = df["Game"]
n = df.shape[0]

In [140]:
y_mean = np.mean(y)
x_mean = np.mean(x)
print("W mean: {}".format(y_mean))
print("G mean: {}".format(x_mean))
b = np.sum((x - x_mean) * (y - y_mean))/np.sum(np.square(x - x_mean))
a = y_mean - b*x_mean
print("a: {}".format(a))
print("b: {}".format(b))



W mean: 10.082
G mean: 8.0
a: 10.386000000000001
b: -0.03800000000000003


In [141]:
y_hat = a + b*x
e = y-y_hat

In [142]:
s2 = (1/(n-2))*np.sum(np.square(e))
s = np.sqrt(s2)
s

0.12282570515227642

In [143]:
r2 = 1 - (np.sum(np.square(e)) / (np.sum(np.square(y-y_mean))))
r2

0.6733728599027364

a)

$R^2 = 0.673$

$s = 0.122$

In [144]:
def color_above_dot_one_red(val):
    """
    Takes a scalar and returns a string with
    the css property `'color: red'` for > 0.1
    strings, black otherwise.
    """
    color = 'red' if abs(val) > 0.1 else 'black'
    return 'color: %s' % color


b)

In [145]:
result_df = pd.DataFrame({
    'Game': x,
    'Winning Time Men': y,
    'Winning Time Men Predicted': y_hat,
    'Error': e
}).style.applymap(color_above_dot_one_red, subset=pd.IndexSlice[:, ['Error']])
result_df

Unnamed: 0,Game,Winning Time Men,Winning Time Men Predicted,Error
0,1,10.3,10.348,-0.048
1,2,10.4,10.31,0.09
2,3,10.5,10.272,0.228
3,4,10.2,10.234,-0.034
4,5,10.0,10.196,-0.196
5,6,9.95,10.158,-0.208
6,7,10.14,10.12,0.02
7,8,10.06,10.082,-0.022
8,9,10.25,10.044,0.206
9,10,9.99,10.006,-0.016


c)

In [151]:
df_new = pd.DataFrame({
    df.columns[0]: [16, 17, 18],
    df.columns[1]: [2008, 2012, 2016],
    df.columns[2]: [9.69, 9.63, 9.81]
})
df_new

Unnamed: 0,Game,Year,Winning time men
0,16,2008,9.69
1,17,2012,9.63
2,18,2016,9.81


In [157]:
df = df.append(df_new, ignore_index=True )


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [159]:
df["Winning Time Men Predicted"] = df.apply(lambda x: a + b*x["Game"], axis=1 )
df["Error"] = df.apply(lambda x: x["Winning time men"] - x["Winning Time Men Predicted"], axis=1)
df

Unnamed: 0,Game,Winning Time Men Predicted,Winning time men,Year,Error
0,1,10.348,10.3,1948,-0.048
1,2,10.31,10.4,1952,0.09
2,3,10.272,10.5,1956,0.228
3,4,10.234,10.2,1960,-0.034
4,5,10.196,10.0,1964,-0.196
5,6,10.158,9.95,1968,-0.208
6,7,10.12,10.14,1972,0.02
7,8,10.082,10.06,1976,-0.022
8,9,10.044,10.25,1980,0.206
9,10,10.006,9.99,1984,-0.016


In [171]:
# Create traces
interval = np.linspace(1, 19, 1000)
points = a + b*interval

trace0 = go.Scatter(
    x = df["Game"],
    y = df["Winning time men"],
    mode = 'markers',
    name = 'Real',
    marker = dict(
          color = 'rgb(255, 0, 0)',
          size = 12,
          line = dict(
            color = 'rgb(0, 0, 0)',
            width = 2
          )
        ),
        showlegend = True
)
trace1 = go.Scatter(
    x = df["Game"],
    y = df["Winning Time Men Predicted"],
    mode = 'markers',
    name = 'Predicted',
    marker = dict(
          color = 'rgb(0, 230, 20)',
          size = 12,
          line = dict(
            color = 'rgb(0, 0, 0)',
            width = 2
          )
        ),
        showlegend = True
)
trace2 = go.Scatter(
    x = interval,
    y = points,
    mode = 'lines',
    name = 'a + bx',
    marker = dict(
          color = 'rgb(0, 0, 255)',
          size = 5,
          line = dict(
            color = 'rgb(0, 0, 0)',
            width = 2
          )
        ),
        showlegend = False
)

data = [trace0, trace1, trace2]
iplot(data, filename='scatter-mode')