# Strategy and Performance Management
### Prediction Model - Timp Health - Group Pre-Assignment
Group 4 : Kirtesh Patel, Nils Marthiensen, Neelesh Bhalla, Chia-Jung Chang

In [20]:
# Import of necessary libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [21]:
# Load the tab-separated data file and drop the two identifier columns,
# which are not used as predictors.
df = (
    pd.read_csv('Timp_Health_PartI_data-1.txt', sep='\t')
    .drop(columns=['RecordID', 'MemberID'])
)

# Keep only rows with a non-zero GrossDrugCost: the dependent variable is
# log-transformed in the next step and log(0) is undefined.
# .copy() makes the filtered frame independent of the unfiltered one, so later
# column assignments on df do not raise pandas' SettingWithCopyWarning.
df = df[df['GrossDrugCost'] != 0].copy()
df

Unnamed: 0,Month,GrossDrugCost,NLISDummy,LISCHOSERDummy,RiskScore,SpecialtyDummy,AdjudicationDays,Age,Gender,FrailityDummy,HospiceDummy,InstitutionDummy,ESRDDummy
0,6,1242.17,0,0,668.4,1,21,96,1,0,0,0,0
1,1,625.86,0,0,290.0,1,21,59,1,0,0,0,0
2,6,27.91,0,0,477.2,1,21,43,1,0,0,0,0
3,6,46451.23,0,0,2135.6,0,21,67,0,0,0,0,0
4,6,6.47,0,0,602.8,0,21,91,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30155,2,161.26,1,0,245.6,0,22,66,1,0,0,0,0
30156,3,337.39,1,0,245.6,0,22,66,1,0,0,0,0
30157,4,358.93,1,0,245.6,1,22,66,1,0,0,0,0
30158,5,73.92,1,0,245.6,0,22,66,1,0,0,0,0


In [22]:
# Test which X-variables have high predictive power on their own by fitting
# one single-variable regression per candidate predictor.

# Dependent variable: natural log of the gross drug cost. Rows with a zero cost
# were already removed when the data was loaded, so no special case is needed.
# The log is stored in Y only; df keeps the raw cost so that re-running this
# cell does not take the logarithm a second time.
Y = np.log(df['GrossDrugCost'])

# Define the independent variables
X = df[['Month', 'NLISDummy', 'LISCHOSERDummy', 'RiskScore', 'SpecialtyDummy',
        'AdjudicationDays', 'Age', 'Gender', 'FrailityDummy', 'HospiceDummy',
        'InstitutionDummy', 'ESRDDummy']]

# Fit one OLS model per predictor (no intercept) and collect its R-squared.
# NOTE(review): because no constant is added, statsmodels reports the
# *uncentered* R-squared here. It is measured against zero rather than against
# the mean of Y, so a column with little variation can score high simply by
# standing in for an intercept. Interpret the ranking with care.
records = []
for column in X.columns:
    results = sm.OLS(Y, X[[column]]).fit()
    records.append({'Variable': column, 'R-squared': results.rsquared})

# Build the table once from the collected rows (instead of concatenating inside
# the loop), sort by highest R-squared and renumber the rows 0..n-1.
results_table = (
    pd.DataFrame(records, columns=['Variable', 'R-squared'])
    .sort_values(by='R-squared', ascending=False)
    .reset_index(drop=True)
)

# Print the results table
print(results_table)

           Variable  R-squared
0  AdjudicationDays   0.900502
0               Age   0.859848
0             Month   0.829509
0         RiskScore   0.719060
0            Gender   0.573614
0    SpecialtyDummy   0.540474
0         NLISDummy   0.441198
0    LISCHOSERDummy   0.330518
0         ESRDDummy   0.020857
0      HospiceDummy   0.002237
0  InstitutionDummy   0.001820
0     FrailityDummy   0.000000


In [23]:
# Fit the multivariate regression on the predictors retained from the
# single-variable screening above, and display the full summary.

# Predictors kept for the final model, judging by the analysis above
selected_columns = [
    'Month',
    'NLISDummy',
    'LISCHOSERDummy',
    'RiskScore',
    'SpecialtyDummy',
    'AdjudicationDays',
    'Age',
    'Gender',
]
X = df[selected_columns]

# Y (the log of GrossDrugCost) is reused unchanged from the cell above.

# No constant intercept term is added: with sm.add_constant(X) the reported
# R-squared drops to 0.228.
# NOTE(review): without a constant the summary reports the uncentered
# R-squared, which is not directly comparable with that 0.228 figure.

# Specify the OLS model, then estimate it
ols_spec = sm.OLS(Y, X)
model = ols_spec.fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:          GrossDrugCost   R-squared (uncentered):                   0.925
Model:                            OLS   Adj. R-squared (uncentered):              0.925
Method:                 Least Squares   F-statistic:                          4.622e+04
Date:                Tue, 19 Sep 2023   Prob (F-statistic):                        0.00
Time:                        20:26:42   Log-Likelihood:                         -54237.
No. Observations:               30022   AIC:                                  1.085e+05
Df Residuals:                   30014   BIC:                                  1.086e+05
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                       coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------

In [24]:
# Render the fitted regression model as a one-line equation string

# Estimated coefficients of the fitted model, labelled by predictor name
coefficients = model.params

# One "<coefficient>*<variable>" term per predictor, with the coefficient
# formatted to four decimal places
terms = [f"{coefficient:.4f}*{variable}" for variable, coefficient in coefficients.items()]

# Joining with ' + ' places the separator only between terms, so there is no
# trailing ' + ' to strip afterwards
equation = "Y = " + " + ".join(terms)

# Print the regression equation
print(equation)

Y = 0.0445*Month + 0.2090*NLISDummy + 0.5593*LISCHOSERDummy + 0.0009*RiskScore + 1.2307*SpecialtyDummy + 0.1871*AdjudicationDays + -0.0056*Age + -0.0669*Gender


### Short explanation

We started by checking which variables have strong predictive power. We then combined those in one model, reaching an (uncentered) R-squared value of 0.925. The precise results of the regression, as well as the model (equation), can be observed above.

R-squared was significantly lower when not taking the log of the dependent variable.

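We did not include a constant intercept term, as this also significantly lowered the R-squared value (0.228). Note, however, that without an intercept statsmodels reports the uncentered R-squared, which is not directly comparable to the centered R-squared of a model with an intercept.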