**Author:** [Piyush Tada](https://www.linkedin.com/in/piyushtada/)  
**Python version:**  x.1

In [1]:
# Loading important libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_lift_curve
import nbconvert
import itertools

# Data Preparation

In [2]:
def getting_data_ready():
    
    """This fuction load all the data and split it.
    To make life simple of loading and checking performance.
    """
    
    # for loading all the data
    df = pd.read_csv('./data/scaled_datatrainingcopy.csv')
    df_test1 = pd.read_csv('./data/scaled_datatestcopy.csv')
    df_test2 = pd.read_csv('./data/scaled_datatest2copy.csv')

    class_name = "Occupancy"

    attributes = [col for col in df.columns if col != class_name]
    X = df[attributes].values # this is used so that you can get np value
    y = df[class_name]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

    attributes = [col for col in df.columns if col != class_name]
    X_test1 = df_test1[attributes].values
    y_test1 = df_test1[class_name]


    attributes = [col for col in df.columns if col != class_name]
    X_test2 = df_test2[attributes].values
    y_test2 = df_test2[class_name]
    
    
    return X_train, X_test, y_train, y_test, X_test1, y_test1, X_test2, y_test2

In [3]:
X_train, X_test, y_train, y_test, X_test1, y_test1, X_test2, y_test2 = getting_data_ready()

# Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error #this are need to get the results

In [5]:
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [6]:
print('Coefficients: \n', reg.coef_)
print('Intercept: \n', reg.intercept_)

Coefficients: 
 [-0.12317009 -0.16808266  0.34852348  0.08131454  0.15719227  0.1239484 ]
Intercept: 
 0.13205106384676116


In [7]:
y_pred = reg.predict(X_test)
print('R2: %.3f' % r2_score(y_test, y_pred))
print('MSE: %.3f' % mean_squared_error(y_test, y_pred))
print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))

R2: 0.891
MSE: 0.018
MAE: 0.082


# R2:
### r-squared
The close to 1 it is the better then it says that the value us used for prediction is moving with the final value that you want.

https://www.investopedia.com/terms/r/r-squared.asp

# MSE: 
### mean squared error (MSE)
It calulate the distace of value from the mean:

An MSE of zero, meaning that the estimator <b> Z </b> predicts observations of the parameter <b> X </b>  with perfect accuracy, is the ideal, but is typically not possible.

https://en.wikipedia.org/wiki/Mean_squared_error

# MAE:
### mean absolute error (MAE)

It's basically the absolute error |x-x| so lower the value the better

In [8]:
# df = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/preprocessdatatrainingcopy.csv')

#loading scaled data
df = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatrainingcopy.csv')

FileNotFoundError: [Errno 2] File /Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatrainingcopy.csv does not exist: '/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatrainingcopy.csv'

In [None]:
df.head()

Making this code for diffent tryes

In [None]:
to_use = 'Humidity' #['Temperature', 'Humidity', 'Light', 
to_find = 'Light'   #'CO2', 'HumidityRatio']

In [None]:
reg = LinearRegression() 
reg.fit(df[[to_use]], df[to_find])

In [None]:
print(len(df[[to_use]]), len(df[to_find]))

In [None]:
y_pred = reg.predict(df[[to_use]])
r = r2_score(df[to_find], y_pred)
print(round(r,ndigits=3))
print('R2: %.3f' % r2_score(df[to_find], y_pred))
print('MSE: %.3f' % mean_squared_error(df[to_find], y_pred))
print('MAE: %.3f' % mean_absolute_error(df[to_find], y_pred))

In [None]:
y_pred = reg.predict(df[[to_use]])
plt.figure(figsize=(10,5))
plt.style.context('ggplot')
plt.xlabel(to_use)
plt.ylabel(to_find)
plt.scatter(df[to_use], df[to_find])
plt.plot(df[[to_use]], y_pred, color = 'black')

In [None]:
len(y_pred)

In [None]:
options = ['Temperature', 'Humidity', 'Light','CO2', 'HumidityRatio']

I got itertools from here https://stackoverflow.com/questions/464864/how-to-get-all-possible-combinations-of-a-list-s-elements

In [None]:
import itertools

for first , second in itertools.combinations(options,2):
    print(first, second)

In [None]:
## Linear Regression in 2 dimensions

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error #this are need to get the results

reg = [("LinearRegression",LinearRegression()), ("Lasso", Lasso()), ("Ridge",Ridge())]

df_report = pd.DataFrame(columns=["Model",'R2', 'MSE', 'MAE', 'Coefficients','Intercept'])


#y_test = X_test.T[0].reshape(-1, 1)
#y_train = X_train.T[0].reshape(-1, 1)

for name, reg in reg:
    
    print("========Results of {}============".format(name))
    reg.fit(X_train.T[0].reshape(-1, 1), y_train)
    y_pred = reg.predict(X_test.T[0].reshape(-1, 1))
    
    print('R2: %.3f' % r2_score(y_test, y_pred))
    print('MSE: %.3f' % mean_squared_error(y_test, y_pred))
    print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))
    
#     #this is the parameters that define how the line will look
#     print('Coefficients: ', reg.coef_)
#     print('Intercept: ', reg.intercept_)
    
#     plt.scatter(X_test.T[0], y_test,  color='black')
#     plt.plot(X_test.T[0], y_pred, color='blue', linewidth=3)
#     df_report["Model"] = name
#     df_report['R2'] = r2_score(y_test, y_pred)
#     df_report['MSE'] = mean_squared_error(y_test, y_pred)
#     df_report['MAE'] = mean_absolute_error(y_test, y_pred)
# #     df_report['Coefficients'] = reg.coef_
# #     df_report['Intercept'] = reg.intercept_
#     df_report = pd.concat([df_report, df_report], axis=0, ignore_index=True )
    
#     plt.show()

In [None]:
df_report

In [None]:
# for saving charts

file_name = "/Users/piyush2017/Downloads/" + name_of_model + "_ROC_curve_" + num + ".png"
       plt.savefig(file_name)

In [None]:
name_of_model = "Linear Regression"




# X_train.T[0].reshape(-1, 1)## Linear Regression in 2 dimensions

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error #this are need to get the results

reg = [("LinearRegression",LinearRegression()), ("Lasso", Lasso()), ("Ridge",Ridge())]

df_report = pd.DataFrame(columns=["Model",'R2', 'MSE', 'MAE', 'Coefficients','Intercept'])


# y_test = X_test.T[1].reshape(-1, 1)
# y_train = X_train.T[1]

for name, reg in reg:
    
    print("========Results of {}============".format(name))
    reg.fit(X_train.T[0].reshape(-1, 1), y_train)
    y_pred = reg.predict(X_test.T[0].reshape(-1, 1))
    
    print('R2: %.3f' % r2_score(y_test, y_pred))
    print('MSE: %.3f' % mean_squared_error(y_test, y_pred))
    print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))
    
    #this is the parameters that define how the line will look
    print('Coefficients: ', reg.coef_)
    print('Intercept: ', reg.intercept_)
    
    plt.scatter(X_test.T[0], y_test,  color='black')
    plt.plot(X_test.T[0], y_pred, color='blue', linewidth=3)
    df_report["Model"] = name
    df_report['R2'] = r2_score(y_test, y_pred)
    df_report['MSE'] = mean_squared_error(y_test, y_pred)
    df_report['MAE'] = mean_absolute_error(y_test, y_pred)
    df_report['Coefficients'] = reg.coef_
    df_report['Intercept'] = reg.intercept_
    df_report = pd.concat([df_report, df_report], axis=1)
    file_name = "/Users/piyush2017/Downloads/" + name_of_model + "_chart_" + name + ".png"

    plt.savefig(file_name)
    plt.show()

# Lasso

In [None]:
reg = Lasso()
reg.fit(X_train.T[0].reshape(-1, 1), y_train)
print('Coefficients: \n', reg.coef_)
print('Intercept: \n', reg.intercept_)

In [None]:
y_pred = reg.predict(X_test.T[0].reshape(-1, 1))
print('R2: %.3f' % r2_score(y_test, y_pred))
print('MSE: %.3f' % mean_squared_error(y_test, y_pred))
print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))

In [None]:
plt.scatter(X_test.T[0], y_test,  color='black')
plt.plot(X_test.T[0], y_pred, color='blue', linewidth=3)
plt.show()

# Ridge

In [None]:
reg = Ridge()
reg.fit(X_train.T[0].reshape(-1, 1), y_train)
print('Coefficients: \n', reg.coef_)
print('Intercept: \n', reg.intercept_)

In [None]:
y_pred = reg.predict(X_test.T[0].reshape(-1, 1))
print('R2: %.3f' % r2_score(y_test, y_pred))
print('MSE: %.3f' % mean_squared_error(y_test, y_pred))
print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))

In [None]:
plt.scatter(X_test.T[0], y_test,  color='black')
plt.plot(X_test.T[0], y_pred, color='blue', linewidth=3)
plt.show()

# Logistic Regression with preprocced data

In [None]:
df = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/preprocessdatatrainingcopy.csv')

In [None]:
selected_columns = ['CO2']
class_name = "Occupancy"
attributes = [col for col in df.columns if col != class_name]
X = df[selected_columns].values
y = df[class_name]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)


In [None]:
plt.scatter(X_train, y_train)
plt.xlabel(selected_columns[0], fontsize=16)
plt.ylabel('Occupancy', fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

In [None]:
from scipy.special import expit

In [None]:
loss = expit(sorted(X_test) * clf.coef_ + clf.intercept_).ravel()
plt.plot(sorted(X_test), loss, color='red', linewidth=3)
plt.scatter(X_train, y_train)
plt.xlabel(selected_columns[0], fontsize=16)
plt.ylabel('Occupancy', fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)
plt.show()

In [None]:
reg = LinearRegression()
reg.fit(X_train, y_train)

In [None]:
plt.scatter(X_train, y_train)

loss = expit(sorted(X_test) * clf.coef_ + clf.intercept_).ravel()
plt.plot(sorted(X_test), loss, color='orange', linewidth=3)

plt.plot(sorted(X_test), reg.coef_ * sorted(X_test) + reg.intercept_, color='red', linewidth=3)


plt.xlabel(selected_columns[0], fontsize=16)
plt.ylabel('Occupancy', fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)
plt.show()

# Logistic Regression

https://realpython.com/logistic-regression-python/

Here I am doing exprement first with only CO2 and testing the result and then later with CO2 and Light

# Theory

Logistic regression is a fundamental classification technique. It belongs to the group of linear classifiers

In [None]:
def getting_data_ready():

    # for loading all the data
    df = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatrainingcopy.csv')
    df_test1 = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatestcopy.csv')
    df_test2 = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatest2copy.csv')

    class_name = "Occupancy"
    selected_columns = ['CO2']
    
    #attributes = [col for col in df.columns if col != class_name]
    X = df[selected_columns].values
    y = df[class_name]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

    #attributes = [col for col in df.columns if col != class_name]
    X_test1 = df_test1[selected_columns].values
    y_test1 = df_test1[class_name]


    #attributes = [col for col in df.columns if col != class_name]
    X_test2 = df_test2[selected_columns].values
    y_test2 = df_test2[class_name]
    
    
    return X_train, X_test, y_train, y_test, X_test1, y_test1, X_test2, y_test2

In [None]:
X_train, X_test, y_train, y_test, X_test1, y_test1, X_test2, y_test2 = getting_data_ready()

# this is list to be used in testing automation

tests = [("Training", X_test,y_test),("Test_1", X_test1,y_test1),
         ("Test_2", X_test2, y_test2)]




In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

In [None]:
def show_report(tests):
    for num, test, results in tests:

        y_pred = clf.predict(test)
        print('\n ===============Results for {} ================== \n'.format(num))
        print('Accuracy %s' % accuracy_score(results, y_pred))
        print('F1-score %s' % f1_score(results, y_pred, average=None))
        print(classification_report(results, y_pred))
        y_score = clf.predict_proba(test)
        plot_roc(results, y_score)
        plot_lift_curve(results, y_score)
        
        plt.show()

In [None]:
show_report(tests)

In [None]:
df_report = pd.DataFrame()

for num, test, results in tests:
    
    y_pred = clf.predict(test)
    results = pd.DataFrame(classification_report(results, y_pred, output_dict=True))
    print(results.T)
    df_report = pd.concat([df_report, results.T], axis=1 )

In [None]:
path_to_save_results = "/Users/piyush2017/Downloads/"

In [None]:
# this code will delete unwanted things

try:
    df_report.drop(['support'], axis=1, inplace=True)
    df_report.drop(['macro avg','weighted avg'], axis=0,inplace=True)

except:
    pass 

file_name = path_to_save_results + "res.xlsx"

df_report.to_excel(file_name)
df_report.head(5)

In [None]:
def get_every_thing(to_use, to_find):
    reg = LinearRegression() 
    reg.fit(df[[to_use]], df[to_find])
    y_pred = reg.predict(df[[to_use]])
    plt.style.context('ggplot')
    plt.xlabel(to_use)
    plt.ylabel(to_find)
    plt.scatter(df[to_use], df[to_find])
    plt.plot(df[[to_use]], y_pred, color = 'black')
    plt.show()
    
    # to get the numerical attribute

    print('R2: %.3f' % r2_score(df[to_find], y_pred))
    print('MSE: %.3f' % mean_squared_error(df[to_find], y_pred))
    print('MAE: %.3f' % mean_absolute_error(df[to_find], y_pred))
    
    # to get two remaing values
    print('Coefficients: ', reg.coef_)
    print('Intercept: ', reg.intercept_)

In [None]:
for first , second in itertools.combinations(options,2):
    print(first, second)
    get_every_thing(first, second)

In [None]:
### trying to do all the three 

In [None]:
def getting_data_ready():

    # for loading all the data
    df = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatrainingcopy.csv')
    df_test1 = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatestcopy.csv')
    df_test2 = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatest2copy.csv')

    class_name = "Occupancy"
    selected_columns = ['CO2','Light']
    
    #attributes = [col for col in df.columns if col != class_name]
    X = df[selected_columns].values
    y = df[class_name]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

    #attributes = [col for col in df.columns if col != class_name]
    X_test1 = df_test1[selected_columns].values
    y_test1 = df_test1[class_name]


    #attributes = [col for col in df.columns if col != class_name]
    X_test2 = df_test2[selected_columns].values
    y_test2 = df_test2[class_name]
    
    
    return X_train, X_test, y_train, y_test, X_test1, y_test1, X_test2, y_test2

In [None]:
X_train, X_test, y_train, y_test, X_test1, y_test1, X_test2, y_test2 = getting_data_ready()

# this is list to be used in testing automation

tests = [("Training", X_test,y_test),("Test_1", X_test1,y_test1),
         ("Test_2", X_test2, y_test2)]




In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

In [None]:
name_of_model = "LogisticRegression"

def show_report_chart_lift(tests):
    for num, test, results in tests:

        y_pred = clf.predict(test)
        y_score = clf.predict_proba(test)
        plot_lift_curve(results, y_score)
        file_name = "/Users/piyush2017/Downloads/" + name_of_model + "_lift_curve_" + num + ".png"
        plt.savefig(file_name)
        plt.show()
        
def show_report_chart_roc(tests):
    for num, test, results in tests:

        y_pred = clf.predict(test)
        y_score = clf.predict_proba(test)
        plot_roc(results, y_score)
        file_name = "/Users/piyush2017/Downloads/" + name_of_model + "_ROC_curve_" + num + ".png"
        plt.savefig(file_name)
        plt.show()

In [None]:
show_report_chart_lift(tests)
show_report_chart_roc(tests)

In [None]:
df_report = pd.DataFrame()

for num, test, results in tests:
    
    y_pred = clf.predict(test)
    results = pd.DataFrame(classification_report(results, y_pred, output_dict=True))
    print(results.T)
    df_report = pd.concat([df_report, results.T], axis=1 )

In [None]:
# this code will delete unwanted things

df_report.drop(['support'], axis=1, inplace=True)
df_report.drop(['macro avg','weighted avg'], axis=0,inplace=True)

df_report.to_excel('/Users/piyush2017/Downloads/res.xlsx')
df_report.head(5)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred, labels=None, sample_weight=None, normalize=None)

print(cm)

sns.heatmap(cm, annot=True,fmt="d",cmap='Oranges')

## Trying with all the atributes

In [None]:
def getting_data_ready():

    # for loading all the data
    df = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatrainingcopy.csv')
    df_test1 = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatestcopy.csv')
    df_test2 = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatest2copy.csv')

    class_name = "Occupancy"
    selected_columns = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']
    
    #attributes = [col for col in df.columns if col != class_name]
    X = df[selected_columns]
    y = df[class_name]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

    #attributes = [col for col in df.columns if col != class_name]
    X_test1 = df_test1[selected_columns].values
    y_test1 = df_test1[class_name]


    #attributes = [col for col in df.columns if col != class_name]
    X_test2 = df_test2[selected_columns].values
    y_test2 = df_test2[class_name]
    
    
    return X_train, X_test, y_train, y_test, X_test1, y_test1, X_test2, y_test2

In [None]:
X_train, X_test, y_train, y_test, X_test1, y_test1, X_test2, y_test2 = getting_data_ready()

# this is list to be used in testing automation

tests = [("Training", X_test,y_test),("Test_1", X_test1,y_test1),
         ("Test_2", X_test2, y_test2)]




In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

In [None]:
def show_report(tests):
    for num, test, results in tests:

        y_pred = clf.predict(test)
        print('\n ===============Results for {} ================== \n'.format(num))
        print('Accuracy %s' % accuracy_score(results, y_pred))
        print('F1-score %s' % f1_score(results, y_pred, average=None))
        print(classification_report(results, y_pred))
        y_score = clf.predict_proba(test)
        plot_roc(results, y_score)
        plot_lift_curve(results, y_score)
        
        plt.show()

In [None]:
show_report(tests)

In [None]:
df_report = pd.DataFrame()

for num, test, results in tests:
    
    y_pred = clf.predict(test)
    results = pd.DataFrame(classification_report(results, y_pred, output_dict=True))
    print(results.T)
    df_report = pd.concat([df_report, results.T], axis=1 )

In [None]:
path_to_save_results = "/Users/piyush2017/Desktop/"

In [None]:
# this code will delete unwanted things

try:
    df_report.drop(['support'], axis=1, inplace=True)
    df_report.drop(['macro avg','weighted avg'], axis=0,inplace=True)

except:
    pass 

file_name = path_to_save_results + "res.xlsx"

df_report.to_excel(file_name)
df_report.head(5)

In [None]:
name_of_model = "LogisticRegression"

def show_report_chart_lift(tests):
    for num, test, results in tests:

        y_pred = clf.predict(test)
        y_score = clf.predict_proba(test)
        plot_lift_curve(results, y_score)
        file_name = "/Users/piyush2017/Downloads/" + name_of_model + "_lift_curve_" + num + ".png"
        plt.savefig(file_name)
        plt.show()
        
def show_report_chart_roc(tests):
    for num, test, results in tests:

        y_pred = clf.predict(test)
        y_score = clf.predict_proba(test)
        plot_roc(results, y_score)
        file_name = "/Users/piyush2017/Downloads/" + name_of_model + "_ROC_curve_" + num + ".png"
        plt.savefig(file_name)
        plt.show()

In [None]:
show_report_chart_lift(tests)
show_report_chart_roc(tests)

In [None]:
df_report = pd.DataFrame(columns=["Model",'R2', 'MSE', 'MAE', 'Coefficients','Intercept'])

In [None]:
df_report

In [None]:
def getting_data_ready():

    # for loading all the data
    df = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatrainingcopy.csv')
    df_test1 = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatestcopy.csv')
    df_test2 = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatest2copy.csv')

    class_name = "Occupancy"
    selected_columns = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio', 'Week_day']
    
    #attributes = [col for col in df.columns if col != class_name]
    X = df[selected_columns]
    y = df[class_name]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

    #attributes = [col for col in df.columns if col != class_name]
    X_test1 = df_test1[selected_columns].values
    y_test1 = df_test1[class_name]


    #attributes = [col for col in df.columns if col != class_name]
    X_test2 = df_test2[selected_columns].values
    y_test2 = df_test2[class_name]
    
    
    return X_train, X_test, y_train, y_test, X_test1, y_test1, X_test2, y_test2

In [None]:
X_train, X_test, y_train, y_test, X_test1, y_test1, X_test2, y_test2 = getting_data_ready()

# this is list to be used in testing automation

tests = [("Training", X_test,y_test),("Test_1", X_test1,y_test1),
         ("Test_2", X_test2, y_test2)]




In [None]:
clf = reg = Ridge()
clf.fit(X_train, y_train)

In [None]:
#checking the magnitude of coefficients

predictors = X_train.columns

coef = pd.Series(clf.coef_,predictors).sort_values()

coef.plot(kind='bar', title='Modal Coefficients')



In [None]:
clf = reg = Lasso()
clf.fit(X_train, y_train)

In [None]:
#checking the magnitude of coefficients

predictors = X_train.columns

coef = pd.Series(clf.coef_,predictors).sort_values()

coef.plot(kind='bar', title='Modal Coefficients')




In [None]:
df = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/data/scaled_datatrainingcopy.csv')

In [None]:
df.columns

In [None]:
col = list(df.columns)

In [None]:
col

In [None]:
for i in col:
    
    col2 = [x for x in col if x != i]
    for j in col2:
        print(i,j)

In [None]:
ij = 'Light'

In [None]:
print([if x is not ij for x in col]) 

In [None]:
print([x for x in col if x != 'Light'])

# trying to put results in a doc

In [None]:
# all this code is to start the document 

from docx import Document
from docx.shared import Inches

document = Document()

document.add_heading('Linear Regression', 0)

p = document.add_paragraph('Here we are trying to find what will be the result of trying diffent combinations of features. ')

# p.add_run('bold').bold = True
# p.add_run(' and some ')
# p.add_run('italic.').italic = True

In [None]:
def get_every_thing(to_use, to_find):
    reg = LinearRegression() 
    reg.fit(df[[to_use]], df[to_find])
    y_pred = reg.predict(df[[to_use]])
    plt.figure(figsize=(10,5))
    plt.style.context('ggplot')
    plt.xlabel(to_use)
    plt.ylabel(to_find)
    plt.scatter(df[to_use], df[to_find])
    plt.plot(df[[to_use]], y_pred, color = 'black')
    plt.savefig('monty-truth.png')
    plt.show()
    
    document.add_picture('monty-truth.png', width=Inches(4))
    
    # to get the numerical attribute
    
    document.add_paragraph(
    [str(('R2: %.3f' % r2_score(df[to_find], y_pred))),
    str(('MSE: %.3f' % mean_squared_error(df[to_find], y_pred))),
    str(('MAE: %.3f' % mean_absolute_error(df[to_find], y_pred)))],style='List Bullet'
)

#     document.add_page_break()
    
#     print('R2: %.3f' % r2_score(df[to_find], y_pred))
#     print('MSE: %.3f' % mean_squared_error(df[to_find], y_pred))
#     print('MAE: %.3f' % mean_absolute_error(df[to_find], y_pred))

In [None]:
for first , second in itertools.combinations(options,2):
    
    document.add_heading("{} {}".format(first, second), level=1)
#     print(first, second)

    get_every_thing(first, second)
    
document.save('/Users/piyush2017/Downloads/LinearRegression.docx')

# _______________Don't touch it._____________________O

# adding table values together

In [None]:
# all this code is to start the document 

from docx import Document
from docx.shared import Inches

document = Document()

document.add_heading('Linear Regression', 0)

p = document.add_paragraph('Here we are trying to find what will be the result of trying diffent combinations of features. ')

# p.add_run('bold').bold = True
# p.add_run(' and some ')
# p.add_run('italic.').italic = True

In [None]:
def get_every_thing(to_use, to_find):
    reg = LinearRegression() 
    reg.fit(df[[to_use]], df[to_find])
    y_pred = reg.predict(df[[to_use]])
    plt.figure(figsize=(10,5))
    plt.style.context('ggplot')
    plt.xlabel(to_use)
    plt.ylabel(to_find)
    plt.scatter(df[to_use], df[to_find])
    plt.plot(df[[to_use]], y_pred, color = 'black')
    plt.savefig('monty-truth.png')
    plt.show()
    
    document.add_picture('monty-truth.png', width=Inches(4))
    
    # to get the numerical attribute
    
    r2 = r2_score(df[to_find], y_pred)
    mse = mean_squared_error(df[to_find], y_pred)
    mae = mean_absolute_error(df[to_find], y_pred)
    
    document.add_paragraph(
    [str(('R2: %.3f' % r2)),
    str(('MSE: %.3f' % mse)),
    str(('MAE: %.3f' % mae))],style='List Bullet'
)

#     document.add_page_break()
    
#     print('R2: %.3f' % r2_score(df[to_find], y_pred))
#     print('MSE: %.3f' % mean_squared_error(df[to_find], y_pred))
#     print('MAE: %.3f' % mean_absolute_error(df[to_find], y_pred))

In [None]:
for first , second in itertools.combinations(options,2):
    
    document.add_heading("{} {}".format(first, second), level=1)
#     print(first, second)

    get_every_thing(first, second)

print(r2, mse, mae, end='\n')
    
document.save('/Users/piyush2017/Downloads/LinearRegression02.docx')

# ==============Don't touch it.================

In [None]:
theory = """
# R2:
### r-squared
The close to 1 it is the better then it says that the value us used for prediction is moving with the final value that you want.

https://www.investopedia.com/terms/r/r-squared.asp

# MSE: 
### mean squared error (MSE)
It calulate the distace of value from the mean:

An MSE of zero, meaning that the estimator <b> Z </b> predicts observations of the parameter <b> X </b>  with perfect accuracy, is the ideal, but is typically not possible.

https://en.wikipedia.org/wiki/Mean_squared_error

# MAE:
### mean absolute error (MAE)

It's basically the absolute error |x-x| so lower the value the better """

In [None]:
# all this code is to start the document 

from docx import Document
from docx.shared import Inches

document = Document()

document.add_heading('Linear Regression', 0)

p = document.add_paragraph('Here we are trying to find what will be the result of trying diffent combinations of features. ')

# p.add_run('bold').bold = True
# p.add_run(' and some ')
# p.add_run('italic.').italic = True

In [None]:
def get_every_thing(to_use, to_find):
    reg = LinearRegression() 
    reg.fit(df[[to_use]], df[to_find])
    y_pred = reg.predict(df[[to_use]])
    plt.figure(figsize=(10,5))
    plt.style.context('ggplot')
    plt.xlabel(to_use)
    plt.ylabel(to_find)
    plt.scatter(df[to_use], df[to_find])
    plt.plot(df[[to_use]], y_pred, color = 'black')
    plt.savefig('monty-truth.png')
    plt.show()
    
    
    document.add_picture('monty-truth.png', width=Inches(4))
    
    # to get the numerical attribute
    r = round(r2_score(df[to_find], y_pred), ndigits=3)
    s = round(mean_squared_error(df[to_find], y_pred), ndigits=3)
    a = round(mean_absolute_error(df[to_find], y_pred))
    
    r2.append(str(r))
    mse.append(str(s))
    mae.append(str(a))
    
    
    
    document.add_paragraph(
    [str(('R2: %.3f' % r)),
    str(('MSE: %.3f' % s)),
    str(('MAE: %.3f' % a))],style='List Bullet'
    )

#     document.add_page_break()
    
#     print('R2: %.3f' % r2_score(df[to_find], y_pred))
#     print('MSE: %.3f' % mean_squared_error(df[to_find], y_pred))
#     print('MAE: %.3f' % mean_absolute_error(df[to_find], y_pred))

In [None]:
r2 = []
mse = []
mae = []
par = []


for first , second in itertools.combinations(options,2):
    
    document.add_heading("{} {}".format(first, second), level=1)
#     print(first, second)
    par.append("{} {}".format(first, second))
    get_every_thing(first, second)

document.add_page_break()
table = document.add_table(rows=1, cols=4)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Pair'
hdr_cells[1].text = 'R2'
hdr_cells[2].text = 'MSE'
hdr_cells[3].text = 'MAE'

for i in range(len(r2)):
    row_cells = table.add_row().cells
    row_cells[0].text = str(par[i])
    row_cells[1].text = r2[i]
    row_cells[2].text = mse[i]
    row_cells[3].text = mae[i]
    print(str(par[i]), str(r2[i]),str(mse[i]),str(mae[i]))
    
document.add_paragraph(theory)
    
print(r2, mse, mae, end='\n')
    
document.save('/Users/piyush2017/Downloads/LinearRegression03.docx')

In [None]:
reg = [("LinearRegression",LinearRegression()), ("Lasso", Lasso()), ("Ridge",Ridge())]
print(reg)

In [None]:
print(theory)

# ==============Don't touch it.================

In [None]:
theory = """
# R2:
### r-squared
The close to 1 it is the better then it says that the value us used for prediction is moving with the final value that you want.

https://www.investopedia.com/terms/r/r-squared.asp

# MSE: 
### mean squared error (MSE)
It calulate the distace of value from the mean:

An MSE of zero, meaning that the estimator <b> Z </b> predicts observations of the parameter <b> X </b>  with perfect accuracy, is the ideal, but is typically not possible.

https://en.wikipedia.org/wiki/Mean_squared_error

# MAE:
### mean absolute error (MAE)

It's basically the absolute error |x-x| so lower the value the better """

In [None]:
df = pd.read_csv('./data/scaled_datatrainingcopy.csv')

In [None]:
# all this code is to start the document 

from docx import Document
from docx.shared import Inches

document = Document()

document.add_heading('Linear Regression', 0)

p = document.add_paragraph('Here we are trying to find what will be the result of trying diffent combinations of features. ')

# p.add_run('bold').bold = True
# p.add_run(' and some ')
# p.add_run('italic.').italic = True

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [None]:
reg = [("LinearRegression",LinearRegression()), ("Lasso", Lasso()), ("Ridge",Ridge())]
print(reg)

In [None]:
def get_every_thing(to_use, to_find):
#     reg = LinearRegression() 
    reg.fit(df[[to_use]], df[to_find])
    y_pred = reg.predict(df[[to_use]])
    plt.figure(figsize=(10,5))
    plt.style.context('ggplot')
    plt.xlabel(to_use)
    plt.ylabel(to_find)
    plt.scatter(df[to_use], df[to_find])
    plt.plot(df[[to_use]], y_pred, color = 'black')
    plt.savefig('monty-truth.png')
    #plt.show()
    plt.clf()
    
    document.add_picture('monty-truth.png', width=Inches(4))
    
    # to get the numerical attribute
    r = round(r2_score(df[to_find], y_pred), ndigits=3)
    s = round(mean_squared_error(df[to_find], y_pred), ndigits=3)
    a = round(mean_absolute_error(df[to_find], y_pred))
    
    r2.append(str(r))
    mse.append(str(s))
    mae.append(str(a))
    
    
    
    document.add_paragraph(
    [str(('R2: %.3f' % r)),
    str(('MSE: %.3f' % s)),
    str(('MAE: %.3f' % a))],style='List Bullet'
    )

#     document.add_page_break()
    
#     print('R2: %.3f' % r2_score(df[to_find], y_pred))
#     print('MSE: %.3f' % mean_squared_error(df[to_find], y_pred))
#     print('MAE: %.3f' % mean_absolute_error(df[to_find], y_pred))

In [None]:


for name, reg in reg:
    r2 = []
    mse = []
    mae = []
    par = []
    
    document.add_heading("Here are the results of {}".format(name), level=2)
    
    for first , second in itertools.combinations(options,2):

        document.add_heading("{} {}".format(first, second), level=1)
    #     print(first, second)
        par.append("{} {}".format(first, second))
        get_every_thing(first, second)

    document.add_page_break()
    table = document.add_table(rows=1, cols=4)
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = 'Pair'
    hdr_cells[1].text = 'R2'
    hdr_cells[2].text = 'MSE'
    hdr_cells[3].text = 'MAE'

    for i in range(len(r2)):
        row_cells = table.add_row().cells
        row_cells[0].text = str(par[i])
        row_cells[1].text = r2[i]
        row_cells[2].text = mse[i]
        row_cells[3].text = mae[i]
        print(str(par[i]), str(r2[i]),str(mse[i]),str(mae[i]))

    document.add_paragraph(theory)

    print(r2, mse, mae, end='\n')
    
document.save('/Users/piyush2017/Downloads/LinearRegression05.docx')

# ==============Don't touch it.===all three data sets==

In [None]:
options = ['Temperature', 'Humidity', 'Light','CO2', 'HumidityRatio']

In [None]:
df = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/jupyter_notebooks/data/scaled_datatrainingcopy.csv')
ts1 = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/jupyter_notebooks/data/scaled_datatestcopy.csv')
ts2 = pd.read_csv('/Users/piyush2017/Code/2020_Data_Mining_Project_02/data_mining_2020_project_occupancy_detection/jupyter_notebooks/data/scaled_datatest2copy.csv')

In [None]:
theory = """
# R2:
### r-squared
The close to 1 it is the better then it says that the value us used for prediction is moving with the final value that you want.

https://www.investopedia.com/terms/r/r-squared.asp

# MSE: 
### mean squared error (MSE)
It calulate the distace of value from the mean:

An MSE of zero, meaning that the estimator <b> Z </b> predicts observations of the parameter <b> X </b>  with perfect accuracy, is the ideal, but is typically not possible.

https://en.wikipedia.org/wiki/Mean_squared_error

# MAE:
### mean absolute error (MAE)

It's basically the absolute error |x-x| so lower the value the better """

In [None]:
# all this code is to start the document 

from docx import Document
from docx.shared import Inches

document = Document()

document.add_heading('Linear Regression', 0)

p = document.add_paragraph('Here we are trying to find what will be the result of trying diffent combinations of features. ')

# p.add_run('bold').bold = True
# p.add_run(' and some ')
# p.add_run('italic.').italic = True

In [None]:
r2 = []
mse = []
mae = []
par = []

def get_every_thing(to_use, to_find):
#     reg = LinearRegression() 
    reg.fit(df[[to_use]], df[to_find])
    y_pred = reg.predict(df[[to_use]])
    plt.figure(figsize=(10,5))
    plt.style.context('ggplot')
    plt.xlabel(to_use)
    plt.ylabel(to_find)
    plt.scatter(df[to_use], df[to_find])
    plt.plot(df[[to_use]], y_pred, color = 'black')
    plt.savefig('monty-truth.png')
    #plt.show()
    plt.clf()
    
    document.add_picture('monty-truth.png', width=Inches(4))
    
    # to get the numerical attribute
    r = round(r2_score(df[to_find], y_pred), ndigits=3)
    s = round(mean_squared_error(df[to_find], y_pred), ndigits=3)
    a = round(mean_absolute_error(df[to_find], y_pred))
    
    r2.append(str(r))
    mse.append(str(s))
    mae.append(str(a))
    
    
    
    document.add_paragraph(
    [str(('R2: %.3f' % r)),
    str(('MSE: %.3f' % s)),
    str(('MAE: %.3f' % a))],style='List Bullet'
    )
    
    #----------------------For two test sets------------------------------
    
    y_pred = reg.predict(ts1[[to_use]])
    plt.figure(figsize=(10,5))
    plt.style.context('ggplot')
    plt.xlabel(to_use)
    plt.ylabel(to_find)
    plt.scatter(ts1[to_use], ts1[to_find])
    plt.plot(ts1[[to_use]], y_pred, color = 'black')
    plt.savefig('monty-truth.png')
    #plt.show()
    plt.clf()
    
    document.add_picture('monty-truth.png', width=Inches(4))
    
    # to get the numerical attribute
    r = round(r2_score(ts1[to_find], y_pred), ndigits=3)
    s = round(mean_squared_error(ts1[to_find], y_pred), ndigits=3)
    a = round(mean_absolute_error(ts1[to_find], y_pred))
    
    r2.append(str(r))
    mse.append(str(s))
    mae.append(str(a))
    
    
    
    document.add_paragraph(
    [str(('R2: %.3f' % r)),
    str(('MSE: %.3f' % s)),
    str(('MAE: %.3f' % a))],style='List Bullet')
    
    
    y_pred = reg.predict(ts2[[to_use]])
    plt.figure(figsize=(10,5))
    plt.style.context('ggplot')
    plt.xlabel(to_use)
    plt.ylabel(to_find)
    plt.scatter(ts2[to_use], ts2[to_find])
    plt.plot(ts2[[to_use]], y_pred, color = 'black')
    plt.savefig('monty-truth.png')
    #plt.show()
    #plt.clf() https://stackoverflow.com/questions/8213522/when-to-use-cla-clf-or-close-for-clearing-a-plot-in-matplotlib
    plt.close()
    
    document.add_picture('monty-truth.png', width=Inches(4))
    
    
    # to get the numerical attribute
    r = round(r2_score(ts2[to_find], y_pred), ndigits=3)
    s = round(mean_squared_error(ts2[to_find], y_pred), ndigits=3)
    a = round(mean_absolute_error(ts2[to_find], y_pred))
    
    r2.append(str(r))
    mse.append(str(s))
    mae.append(str(a))
    
    
    
    document.add_paragraph(
    [str(('R2: %.3f' % r)),
    str(('MSE: %.3f' % s)),
    str(('MAE: %.3f' % a))],style='List Bullet'
    )
    

#     document.add_page_break()
    
#     print('R2: %.3f' % r2_score(df[to_find], y_pred))
#     print('MSE: %.3f' % mean_squared_error(df[to_find], y_pred))
#     print('MAE: %.3f' % mean_absolute_error(df[to_find], y_pred))

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error #this are need to get the results

In [None]:
reg = [("LinearRegression",LinearRegression())]#, ("Lasso", Lasso()), ("Ridge",Ridge())]
print(reg)

In [None]:


for name, reg in reg:
#     r2 = []
#     mse = []
#     mae = []
#     par = []
    
    document.add_heading("Here are the results of {}".format(name), level=2)
    
    for first , second in itertools.combinations(options,2):

        document.add_heading("{} {}".format(first, second), level=1)
    #     print(first, second)
        par.append("{} {}".format(first, second))
        get_every_thing(first, second)

    document.add_page_break()
    table = document.add_table(rows=1, cols=4)
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = 'Pair'
    hdr_cells[1].text = 'R2'
    hdr_cells[2].text = 'MSE'
    hdr_cells[3].text = 'MAE'

#     for i in range(len(r2)):
#         row_cells = table.add_row().cells
#         row_cells[0].text = str(par[i])
#         row_cells[1].text = r2[i]
#         row_cells[2].text = mse[i]
#         row_cells[3].text = mae[i]
#         print(str(par[i]), str(r2[i]),str(mse[i]),str(mae[i]))

    document.add_paragraph(theory)

    print(r2, mse, mae, end='\n')
    
document.save('/Users/piyush2017/Downloads/LinearRegression07.docx')

In [None]:
len(r2)

In [None]:
r2df = pd.DataFrame(r2)
msedf = pd.DataFrame(mse)
maedf = pd.DataFrame(mae)

In [None]:
result_measure = pd.concat([r2df, msedf, maedf], axis=1)

In [None]:
result_measure.to_csv('linear_table.csv')

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error #this are need to get the results

In [None]:
reg = [("LinearRegression",LinearRegression())]#, ("Lasso", Lasso()), ("Ridge",Ridge())]
print(reg)

In [None]:


def get_every_thing(to_use, to_find):
    
    r2 = []
    mse = []
    mae = []
    par = []
#     reg = LinearRegression() 
    reg.fit(df[[to_use]], df[to_find])
    y_pred = reg.predict(df[[to_use]])
#     plt.figure(figsize=(10,5))
#     plt.style.context('ggplot')
#     plt.xlabel(to_use)
#     plt.ylabel(to_find)
#     plt.scatter(df[to_use], df[to_find])
#     plt.plot(df[[to_use]], y_pred, color = 'black')
#     plt.savefig('monty-truth.png')
#     #plt.show()
#     plt.clf()
    
#     document.add_picture('monty-truth.png', width=Inches(4))
    
    # to get the numerical attribute
    r = round(r2_score(df[to_find], y_pred), ndigits=3)
    s = round(mean_squared_error(df[to_find], y_pred), ndigits=3)
    a = round(mean_absolute_error(df[to_find], y_pred))
    
    r2.append(str(r))
    mse.append(str(s))
    mae.append(str(a))
    par.append(str(to_use+ "/" to_find))
    
    
    document.add_paragraph(
    [str(('R2: %.3f' % r)),
    str(('MSE: %.3f' % s)),
    str(('MAE: %.3f' % a))],style='List Bullet'
    )
    
    #----------------------For two test sets------------------------------
    
    y_pred = reg.predict(ts1[[to_use]])
#     plt.figure(figsize=(10,5))
#     plt.style.context('ggplot')
#     plt.xlabel(to_use)
#     plt.ylabel(to_find)
#     plt.scatter(ts1[to_use], ts1[to_find])
#     plt.plot(ts1[[to_use]], y_pred, color = 'black')
#     plt.savefig('monty-truth.png')
#     #plt.show()
#     plt.clf()
    
    document.add_picture('monty-truth.png', width=Inches(4))
    
    # to get the numerical attribute
    r = round(r2_score(ts1[to_find], y_pred), ndigits=3)
    s = round(mean_squared_error(ts1[to_find], y_pred), ndigits=3)
    a = round(mean_absolute_error(ts1[to_find], y_pred))
    
    r2.append(str(r))
    mse.append(str(s))
    mae.append(str(a))
    par.append(str(to_use+ "/" to_find))
    
    
#     document.add_paragraph(
#     [str(('R2: %.3f' % r)),
#     str(('MSE: %.3f' % s)),
#     str(('MAE: %.3f' % a))],style='List Bullet')
    
    
    y_pred = reg.predict(ts2[[to_use]])
#     plt.figure(figsize=(10,5))
#     plt.style.context('ggplot')
#     plt.xlabel(to_use)
#     plt.ylabel(to_find)
#     plt.scatter(ts2[to_use], ts2[to_find])
#     plt.plot(ts2[[to_use]], y_pred, color = 'black')
#     plt.savefig('monty-truth.png')
#     #plt.show()
#     #plt.clf() https://stackoverflow.com/questions/8213522/when-to-use-cla-clf-or-close-for-clearing-a-plot-in-matplotlib
#     plt.close()
    
#     document.add_picture('monty-truth.png', width=Inches(4))
    
    
    # to get the numerical attribute
    r = round(r2_score(ts2[to_find], y_pred), ndigits=3)
    s = round(mean_squared_error(ts2[to_find], y_pred), ndigits=3)
    a = round(mean_absolute_error(ts2[to_find], y_pred))
    
    r2.append(str(r))
    mse.append(str(s))
    mae.append(str(a))
    par.append(str(to_use+ "/" to_find))
    
    
    r2df = pd.DataFrame(r2)
    msedf = pd.DataFrame(mse)
    maedf = pd.DataFrame(mae)
    pardf = pd.DataFrame(par)
    
    result_measure = pd.concat([par,r2df, msedf, maedf], axis=1)
    
    
#     document.add_paragraph(
#     [str(('R2: %.3f' % r)),
#     str(('MSE: %.3f' % s)),
#     str(('MAE: %.3f' % a))],style='List Bullet'
#     )
    

#     document.add_page_break()
    
#     print('R2: %.3f' % r2_score(df[to_find], y_pred))
#     print('MSE: %.3f' % mean_squared_error(df[to_find], y_pred))
#     print('MAE: %.3f' % mean_absolute_error(df[to_find], y_pred))

## Getting result for all the datasets

In [9]:
options = ['Temperature', 'Humidity', 'Light','CO2', 'HumidityRatio']

In [10]:
# for loading all the data
df = pd.read_csv('./data/scaled_datatrainingcopy.csv')
ts1 = pd.read_csv('./data/scaled_datatestcopy.csv')
ts2 = pd.read_csv('./data/scaled_datatest2copy.csv')

data_sets = [df, ts1, ts2]

In [11]:
def get_every_thing(to_use, to_find):
    
    
    reg.fit(df[[to_use]], df[to_find])
    
    for i in data_sets:
        y_pred = reg.predict(i[[to_use]])

        # to get the numerical attribute
        r = round(r2_score(i[to_find], y_pred), ndigits=3)
        s = round(mean_squared_error(i[to_find], y_pred), ndigits=3)
        a = round(mean_absolute_error(i[to_find], y_pred))

        r2.append(str(r))
        mse.append(str(s))
        mae.append(str(a))
        par.append(str(to_use + to_find))


        

In [22]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error #this are need to get the results
reg = [("Lasso",Lasso())]#, ("Lasso", Lasso()), ("Ridge",Ridge())]


r2 = []
mse = []
mae = []
par = []

for name, reg in reg:    
    for first , second in itertools.combinations(options,2):
        get_every_thing(first, second)

r2df = pd.DataFrame(r2)
msedf = pd.DataFrame(mse)
maedf = pd.DataFrame(mae)
pardf = pd.DataFrame(par)

result_measure = pd.concat([pardf,r2df, msedf, maedf], axis=1)

result_measure.to_csv('/Users/piyush2017/Desktop/line04.csv')

In [23]:
pardf

Unnamed: 0,0
0,TemperatureHumidity
1,TemperatureHumidity
2,TemperatureHumidity
3,TemperatureLight
4,TemperatureLight
5,TemperatureLight
6,TemperatureCO2
7,TemperatureCO2
8,TemperatureCO2
9,TemperatureHumidityRatio


In [21]:
par

['TemperatureHumidity',
 'TemperatureHumidity',
 'TemperatureHumidity',
 'TemperatureLight',
 'TemperatureLight',
 'TemperatureLight',
 'TemperatureCO2',
 'TemperatureCO2',
 'TemperatureCO2',
 'TemperatureHumidityRatio',
 'TemperatureHumidityRatio',
 'TemperatureHumidityRatio',
 'HumidityLight',
 'HumidityLight',
 'HumidityLight',
 'HumidityCO2',
 'HumidityCO2',
 'HumidityCO2',
 'HumidityHumidityRatio',
 'HumidityHumidityRatio',
 'HumidityHumidityRatio',
 'LightCO2',
 'LightCO2',
 'LightCO2',
 'LightHumidityRatio',
 'LightHumidityRatio',
 'LightHumidityRatio',
 'CO2HumidityRatio',
 'CO2HumidityRatio',
 'CO2HumidityRatio']

In [16]:
r2df = pd.DataFrame(r2)
msedf = pd.DataFrame(mse)
maedf = pd.DataFrame(mae)
pardf = pd.DataFrame(par)

In [None]:
result_measure = pd.concat([par,r2df, msedf, maedf], axis=1)

In [None]:
result_measure.to_csv('/Users/piyush2017/Downloads/linear_table.csv')