# Linear Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import feature_selection
import pandas.tseries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from sklearn.metrics import r2_score
from sklearn import svm

  from pandas.core import datetools


In [6]:
df_cap = pd.read_csv('T_UWWTPS.csv')
df_cap2 = df_cap.drop(df_cap.index[4332]) # remove abnormal point
df_temp = pd.read_csv('location_temperature.csv')

df_cap3 = df_cap2[['uwwLatitude', 'uwwLongitude']]
df_cap3.shape

(27082, 2)

In [9]:
df_cleaned = pd.DataFrame(data = {'LoadEntering': df_cap2['uwwLoadEnteringUWWTP'],'Capacity': df_cap2['uwwCapacity'], 
                          'T': df_temp['temperature'],'NRemoval':df_cap2['uwwNRemoval'],'PRemoval':df_cap2['uwwPRemoval'],
                                  'Longitude': df_cap3['uwwLongitude'], 'Latitude':df_cap3['uwwLatitude']})
df_cleaned.head()

Unnamed: 0,Capacity,Latitude,LoadEntering,Longitude,NRemoval,PRemoval,T
0,8000.0,47.77919,6346.0,17.05039,True,True,2.21
1,65000.0,47.59491,10032.0,16.64161,True,True,2.88
2,42000.0,47.83195,38697.0,16.54416,True,True,3.96
3,7250.0,47.82736,3422.0,16.92429,True,True,2.93
4,5000.0,48.01208,1351.0,16.99434,True,True,4.06


In [10]:
df_no_missing = df_cleaned.dropna()
df_no_zeros = df_no_missing[df_no_missing.LoadEntering != 0]
df = df_no_zeros[df_no_zeros.Capacity != 0]
df.head()

Unnamed: 0,Capacity,Latitude,LoadEntering,Longitude,NRemoval,PRemoval,T
0,8000.0,47.77919,6346.0,17.05039,True,True,2.21
1,65000.0,47.59491,10032.0,16.64161,True,True,2.88
2,42000.0,47.83195,38697.0,16.54416,True,True,3.96
3,7250.0,47.82736,3422.0,16.92429,True,True,2.93
4,5000.0,48.01208,1351.0,16.99434,True,True,4.06


In [2]:
def split_train_test(df):
    x = df[['LoadEntering', 'Longitude', 'Latitude']]
    y = df['Capacity']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)
    df_train = pd.DataFrame(data = {'Capacity':y_train, 'LoadEntering':x_train['LoadEntering'], 
                                'Longitude':x_train['Longitude'], 'Latitude':x_train['Latitude']})
    return df_train, x_test, y_test

def linear_regr(df_train, x_test):
    model = smf.ols("Capacity ~ LoadEntering + Longitude * Latitude", df_train)
    result = model.fit()
    y_predict = result.predict(x_test)
    return result, y_predict


* F_statistics is greater than 1 and p value is 0, which is a strong evidence to show that there is a relationship between LoadEntering and Capacity. $R^2$ is 0.867, which means 86.7% of training data can be interpreted in the regression line. 

In [None]:
# studentized residual and leverage
influence = result.get_influence()
stu_residual = influence.resid_studentized_external
(cooks, p) = influence.cooks_distance
(dffits, p) = influence.dffits
leverage = influence.hat_matrix_diag
# high leverage points
fig, axes = plt.subplots(1, 2, figsize = (10, 4), dpi = 700)
axes[0].scatter(leverage, stu_residual, marker = '.')
axes[0].set_title('Studentized Residual vs. Leverage')
axes[0].set_xlabel('Leverage')
axes[0].set_ylabel('Studentized Residual')
axes[0].axhline(y = 0, ls = '--', linewidth = 0.7, c = 'black')
# outliers
axes[1].scatter(result.predict(), stu_residual, marker='.')
axes[1].set_title('Studentized Residual vs. Fitted Values')
axes[1].set_xlabel('Fitted Values')
axes[1].set_ylabel('Studentized Residual')
axes[1].axhline(y = 0, ls = '--', linewidth = 0.7, c = 'black')

In [None]:
# residual between y_train and y_predict
# remove outliers (exclude [-10, 10])
cp_index = []
for i in range(len(stu_residual)):
    if abs(stu_residual[i]) <= 10:
        cp_index.append(i)

nw_x_train = []
nw_y_train = []
for i in cp_index:
    nw_x_train.append(x_train[i])
    nw_y_train.append(y_train[i])

In [None]:
# using new list that has remove some outliers to get a better linear regression
nw_result = sm.OLS(nw_y_train, nw_x_train).fit()
print(nw_result.summary())

In [None]:
plt.subplots(figsize = (8, 5), dpi = 800)
plt.scatter(x_train, y_train, c='tan', marker = '.', label = 'training data')
plt.scatter(x_test, y_test, c='r', marker = '.', label = 'testing data')
plt.plot(nw_x_train, nw_result.predict(), label = 'Fit with outliers removed')
plt.plot(x_train, result.predict(), c='tan', label = 'Fit without outliers removed')
plt.legend()
plt.xlabel('Load Entering')
plt.ylabel('Capacity')

# Classification

In [37]:
customer = pd.DataFrame(data = {'Capacity': [6200.0], 'Longitude': [17.0], 'Latitude': [47.0]})

In [38]:
def knn(df, customer):
    min_r = np.sqrt((customer.Capacity[0] - df.Capacity[0])**2 + (customer.Longitude[0] - df.Longitude[0])**2 + (customer.Latitude[0] - df.Latitude[0])**2)
    min_index = 0
    for i in range(1, len(df) - 1):
        r_2 = np.sqrt((customer.Capacity[0] - df.Capacity[i])**2 + (customer.Longitude[0] - df.Longitude[i])**2 + (customer.Latitude[0] - df.Latitude[i])**2)
        if r_2 <= min_r:
            min_r = r_2
            min_index = i
    nearest = df.iloc[min_index]
    return min_r, min_index, nearest

In [39]:
min_r, min_index, nearest = knn(df, customer)

KeyError: 38

In [11]:
dfy = pd.DataFrame(data = {'A': [1, 2, 3, 4, 5, 6], 'B':[6, 5, 4, 3, 2, 1], 'C': [2, 4, 6, 8, 9, 4]})
dfy

Unnamed: 0,A,B,C
0,1,6,2
1,2,5,4
2,3,4,6
3,4,3,8
4,5,2,9
5,6,1,4


In [34]:
c = pd.DataFrame('')
min_1 = 0
for i in range(1, len(dfy) - 1):
    

range(1, 5)