In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,VotingRegressor,StackingRegressor
from xgboost import XGBRegressor

from lib.feature_generator import FeatureGenerator

In [2]:
parent_dir_path = os.path.dirname(os.getcwd())
segment_data_path =  os.path.join(parent_dir_path,'data'+ os.path.sep +'segments'+os.path.sep)
main_data_path = os.path.join(parent_dir_path,'data'+ os.path.sep +'main'+os.path.sep)

In [3]:
segment_key = 'Private_Not for Profit_BSN_Great Lakes'

In [4]:
df = pd.read_csv(segment_data_path + segment_key + '.csv').drop('Unnamed: 0',axis=1)

In [5]:
feature_generator=FeatureGenerator(lag_window=4,rolmean_window=4,rolstd_window=4)

initialize feature generator


In [6]:
feature_df = feature_generator.generate_lags(df)
feature_df = feature_generator.generate_rolling_mean(feature_df)
feature_df = feature_generator.generate_rolling_std(feature_df)
feature_df = feature_generator.generate_billing_student_diff(feature_df)

  df[f'#TEASexamsAttempt{j}Advanced_rolling_std_{i}']= df[f'#TEASexamsAttempt{j}Advanced'].rolling(window=i).std()
  df[f'#TEASexamsAttempt{j}Exemplary_rolling_std_{i}'] = df[f'#TEASexamsAttempt{j}Exemplary'].rolling(window=i).std()
  df[f'#TEASexamsAttempt{j}Incomplete_rolling_std_{i}'] = df[f'#TEASexamsAttempt{j}Incomplete'].rolling(window=i).std()
  df[f'#TEASexamsAttempt{j}rolling_std_{i}'] = df[f'#TEASexamsAttempt{j}'].rolling(window=i).std()
  df[f'#TEASexamsAttempt{j}Developmentalrolling_std_{i}']= df[f'#TEASexamsAttempt{j}Developmental'].rolling(window=i).std()
  df[f'#TEASexamsAttempt{j}Basic_rolling_std_{i}']= df[f'#TEASexamsAttempt{j}Basic'].rolling(window=i).std()
  df[f'#TEASexamsAttempt{j}Proficient_rolling_std_{i}']= df[f'#TEASexamsAttempt{j}Proficient'].rolling(window=i).std()
  df[f'billed_rolstd_{i}'] = df['BilledStudentsPmt1'].rolling(window=i).std()
  df[f'registered_rolstd_{i}'] = df['#UsersRegistered'].rolling(window=i).std()
  df['NumberBilledStudents_diff'] = df

In [7]:
feature_df=feature_df.drop(['Year', 'Quarter', 'Month'],axis=1)
feature_df = feature_df.dropna()

In [8]:
corr = feature_df.corr()
corr['select_corr']=corr['BilledStudentsPmt1'].apply(lambda x : x > 0.4 or x < -0.4)
corr = corr[corr['select_corr']==True]
corr_features = list(corr.index)
#corr_features.remove('BilledStudentsPmt1')  

In [10]:
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(feature_df), columns=feature_df.columns)  

In [11]:
split_len = len(df_scaled)-4 ##leaving out last 3 quarters for prediction
df_train = df_scaled[:split_len]
df_test = df_scaled[-4:]

#len(df_train),len(df_test)
x_train= df_train.drop('BilledStudentsPmt1',axis=1)
y_train = df_train[['BilledStudentsPmt1']]

x_test=df_test.drop('BilledStudentsPmt1',axis=1)
y_test = df_test[['BilledStudentsPmt1']]

In [12]:
rf = RandomForestRegressor()
xgb = XGBRegressor()

In [13]:
# Fit the model
rf.fit(x_train, y_train)

features = list(x_train.columns)
importances = rf.feature_importances_

# Create a dataframe for visualization
importances_df = pd.DataFrame({
    'Features': features,
    'Importance': importances
})

rf_features = importances_df.head(10)['Features'].tolist()

  


In [14]:
rf_features

['#TEASexamsAttempt1',
 '#TEASexamsAttempt1Developmental',
 '#TEASexamsAttempt1Basic',
 '#TEASexamsAttempt1Proficient',
 '#TEASexamsAttempt1Advanced',
 '#TEASexamsAttempt1Exemplary',
 '#TEASexamsAttempt1Incomplete',
 '#TEASexamsAttempt2',
 '#TEASexamsAttempt2Developmental',
 '#TEASexamsAttempt2Basic']

In [15]:
xgb.fit(x_train, y_train)
features = list(x_train.columns)
importances = xgb.feature_importances_

# Create a dataframe for visualization
importances_df = pd.DataFrame({
    'Features': features,
    'Importance': importances
})

# Sort the dataframe by importance in descending order
importances_df = importances_df.sort_values(by='Importance', ascending=False)
xgb_features = importances_df.head(10)['Features'].tolist()

In [18]:
selected_features = ['registered_lag_2',
                     'registered_rolmean_2',
                     'registered_rolstd_2',
                     'NumberBilledStudents_diff',
                     'billed_lag_2',
                     'billed_rolmean_2',
                     'billed_rolstd_2',
                     '#TEASexamsAttempt1lag_2',  
                     '#TEASexamsAttempt1rolling_std_2',  
                     '#TEASexamsAttempt1rolling_mean_2'                  
                                         ]

In [19]:
selected_features.append('BilledStudentsPmt1')

In [20]:
feature_df[selected_features].to_csv(main_data_path + segment_key + '_features.csv')