In [31]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,KFold, cross_val_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [32]:
df = pd.read_csv('final_data.csv')
ecodistricts = df['ECODISTRICT_ID'].unique()
pca = PCA()

# Define cross-validation folds
cv = KFold(n_splits=5, shuffle=True, random_state=42)
data = []

In [33]:
for i in range(0, len(ecodistricts)):
    df_new = df[df['ECODISTRICT_ID'] == ecodistricts[i]]
    unique_townships = len(df_new['TWP_ID'].unique())
    df_new.drop(['TWP_ID','ECODISTRICT_ID','YEAR'], axis=1, inplace= True)
    Y = df_new['YieldKgAcre']
    df_new.drop('YieldKgAcre', axis=1, inplace= True)
    X = df_new
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)

    # Run standardization on X variables
    X_train_scaled = scale(X_train)
    X_test_scaled = scale(X_test)


    # Linear Regression
    lin_reg = LinearRegression().fit(X_train_scaled, y_train)
    lr_score_train = -1 * cross_val_score(lin_reg, X_train_scaled, y_train, cv=cv, scoring='neg_root_mean_squared_error').mean()
    lr_score_test = mean_squared_error(y_test, lin_reg.predict(X_test_scaled), squared=False)

    # Generate all the principal components
    ## pca = PCA() # Default n_components = min(n_samples, n_features)
    X_train_pc = pca.fit_transform(X_train_scaled)

    # View first 5 rows of all principal components
    pd.DataFrame(pca.components_.T).loc[:4,:]

    result = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
    for j in range(0, len(result)):
        if result[j] > 95:
            index = j
            variance = result[j]
            break
    # Initialize linear regression instance
    lin_reg = LinearRegression()

    # determine optimal number of principal components
    best_pc_num = index + 1

    # Train model with first best principal components
    lin_reg_pc = LinearRegression().fit(X_train_pc[:,:best_pc_num], y_train)

    # Get cross-validation RMSE (train set)
    pcr_score_train = -1 * cross_val_score(lin_reg_pc, X_train_pc[:,:best_pc_num], y_train, cv=cv, scoring='neg_root_mean_squared_error').mean()

    # Train model on training set
    lin_reg_pc = LinearRegression().fit(X_train_pc[:,:best_pc_num], y_train)

    # Get first 20 principal components of test set
    X_test_pc = pca.transform(X_test_scaled)[:,:best_pc_num]

    # Predict on test data
    preds = lin_reg_pc.predict(X_test_pc)
    pcr_score_test = mean_squared_error(y_test, preds, squared=False)
    
    row = []

    pcr_score_train = round(pcr_score_train, 2)
    pcr_score_test = round(pcr_score_test, 2)
    r_squared_train = round(lin_reg_pc.score(X_train_pc[:,:best_pc_num], y_train)*100, 2)
    r_squared_test = round(lin_reg_pc.score(X_test_pc[:,:best_pc_num], y_test)*100, 2)

    # Calculate the absolute errors
    errors = abs(preds - y_test)
    # Print out the mean absolute error (mae)
    mae = round(np.mean(errors), 2)
    # Calculate mean absolute percentage error (MAPE)
    mape = 100 * (errors / y_test)
    # Calculate and display accuracy
    accuracy = 100 - np.mean(mape)
    accuracy = round(accuracy, 2)
    
    row.append(ecodistricts[i])
    row.append(len(df_new))
    row.append(unique_townships)
    row.append(pcr_score_train)
    row.append(pcr_score_test)
    row.append(r_squared_train)
    row.append(r_squared_test)
    row.append(mae)
    row.append(accuracy)
    row.append(best_pc_num)
    row.append(variance)
    data.append(row)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

In [34]:
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['Ecodistrict ID', '# Records','# Unique Townships','MSE Training','MSE Test','R Squared Training','R Squared Test','Mean Absolute Error (Degrees)','Accuracy (%)','# Principal Components','Explained Variance (%)'])

In [35]:
df.to_csv('model_summary.csv')