## First: tools from the toolbox

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression

import wrangle
import split_scale
# import features
# don't have features yet


### Acquire and split the data into train and test

In [25]:
df = wrangle.wrangle_grades()

### 1 - Split the data into train and test datasets

In [53]:
train, test = split_scale.split_my_data(df)


X_train = train.drop(columns="final_grade")
y_train = train[["final_grade"]]
X_test = test.drop(columns="final_grade")
y_test = test[["final_grade"]]

print(X_train.head())
print(X_test.head())
print("..................................................")
print(y_train.head())
print(y_test.head())

    student_id  exam1  exam2  exam3
83          84     79     70     85
10          11     58     65     70
55          56     83     80     86
43          44     93     90     96
23          24     58     65     70
    student_id  exam1  exam2  exam3
8            9     70     65     78
73          74     70     65     78
91          92    100     90     95
30          31     93     90     96
65          66    100     90     95
..................................................
    final_grade
83           81
10           68
55           85
43           97
23           68
    final_grade
8            77
73           77
91           96
30           97
65           96


### 2 - Create a model that uses exam1 to predict the final grade

#### a. SelectK will show us if it chose exam 1

In [37]:
# initialize f_selector - this defines the test for scoring and the # of features 
# (columns) we want to keep.

f_selector = SelectKBest(f_regression, k=1)

In [38]:
# Next, fit the object to the data
# Here, the f_selector is scoring, ranking, and identifying the top features (columns)

f_selector.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SelectKBest(k=1, score_func=<function f_regression at 0x12775edd0>)

In [39]:
# Now transform the datatset to reduce it to the best k feature, exam 1

X_reduced = f_selector.transform(X_train)

print(X_train.shape)
print(y_train.shape)

(76, 4)
(76, 1)


In [41]:
# Let's get an idea of which index was used by repping the columns (features) w/ bools:

f_support = f_selector.get_support()

print(f_support)

[False  True False False]


In [42]:
# Now we can use .loc with our mask, using '.columns' to get column names and
# convert those into a list using '.tolist()':

f_feature = X_train.loc[: , f_support].columns.tolist()

print(f_feature)

['exam1']


### 5 - Create a model that uses exam2 to predict the final grade:

In [63]:
# initialize f_selector - this defines the test for scoring and the # of features 
# (columns) we want to keep.

f_selector = SelectKBest(f_regression, k=2)

In [64]:
# Next, fit the object to the data
# Here, the f_selector is scoring, ranking, and identifying the top features (columns)

f_selector.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SelectKBest(k=2, score_func=<function f_regression at 0x12775edd0>)

In [65]:
# Now transform the datatset to reduce it to the best k features, exam1 and exam3

X_reduced = f_selector.transform(X_train)

print(X_train.shape)
print(y_train.shape)

(76, 4)
(76, 1)


In [66]:
# Let's get an idea of which index was used by repping the columns (features) w/ bools:

f_support = f_selector.get_support()

print(f_support)

[False  True False  True]


In [69]:
# Now we can use .loc with our mask, using '.columns' to get column names and
# convert those into a list using '.tolist()':

f_feature = X_train.loc[: , f_support].columns.tolist()

print(f_feature)

['exam1', 'exam3']


In [28]:
lm = LinearRegression()

rfe = RFE(lm, 2)

# Transform data using RFE:

X_rfe = rfe.fit_transform(X_train, y_train)

  y = column_or_1d(y, warn=True)


#### b. Get the list of feature names that were selected so we have some context 

In [13]:
# Mask up!

mask = rfe.support_

# Get the column names of the features that were selected and turn them into a list:

rfe_features = X_train.columns[mask]

# print them out

print(f"Selected {len(rfe_features)} features: ", ", ".join(rfe_features))

Selected 2 features:  exam1, exam3


#### Initialize the ML Algorithm Object.  Did that above, but not sure if that's the best one

In [14]:
lm = LinearRegression()
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

#### Next, fit the model by feeding the training data into the models, and getting the "learned" parameters returned

In [15]:
# Fitting data to the model:

lm.fit(X_rfe, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
# Let's take a look at the parameters (the y-intercepts and coefficients) the
# the Linear Regression model calculated:

print("Linear Model:", lm)

print("Intercept: ", lm.intercept_)

print("Features: ", rfe_features)

print("Coefficients: ", lm.coef_)

Linear Model: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
Intercept:  [11.54460153]
Features:  Index(['exam1', 'exam3'], dtype='object')
Coefficients:  [[0.60003062 0.27533144]]


In [18]:
predict = "final_grade"