# Cortex Game Python Demo-Round 1

Import SASpy and connect to your SAS Studio session


(Instructions for installing and configuring SASpy:
https://support.sas.com/ondemand/saspy.html )

In [1]:
from saspy import SASsession
# Open a SASpy session using default arguments (no configuration name is passed here).
sas_session = SASsession()
# Bare last expression: the notebook displays the session's summary.
sas_session

Using SAS Config named: oda
SAS Connection established. Subprocess id is 2050



Access Method         = IOM
SAS Config name       = oda
SAS Config file       = /Users/sunxinyi/opt/anaconda3/lib/python3.8/site-packages/saspy/sascfg_personal.py
WORK Path             = /saswork/SAS_work46E70000FFA0_odaws04-usw2.oda.sas.com/SAS_workA2320000FFA0_odaws04-usw2.oda.sas.com/
SAS Version           = 9.04.01M6P11072018
SASPy Version         = 3.7.2
Teach me SAS          = False
Batch                 = False
Results               = Pandas
SAS Session Encoding  = utf-8
Python Encoding value = utf-8
SAS process Pid value = 65440


Load raw datasets from SAS Studio (change the folder path to where your "Cortex Data Sets" folder is located).

In [2]:
%%SAS sas_session

/* Assign the libref "cortex" to the shared data folder; edit the path to match your own account. */
libname cortex '/home/u58717790/my_shared_file_links/u39842936/Cortex Data Sets';

# Step1-2 Merge and DataPartition

Links for reference:

Pandas library: https://pandas.pydata.org/docs/user_guide/index.html


sklearn.model_selection: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [3]:
import pandas as pd

# Copy the SAS tables from the "cortex" library into pandas DataFrames.
# The data is pulled from the SAS session, so this step may take some time.

# cortex.hist
data1 = sas_session.sasdata2dataframe(table='hist', libref='cortex')

# cortex.target_rd1
data2 = sas_session.sasdata2dataframe(table='target_rd1', libref='cortex')

In [4]:
# Step 1: merge the data
# Right join on ID: every row of data2 (target_rd1) is kept and the matching
# data1 (hist) columns are attached to it.
data_merge = pd.merge(data1, data2, on=["ID"], how="right")

# Handle missing values by dropping every row that contains at least one NaN.
# TODO: consider another treatment for missing data (e.g. imputation) later.
data_merge = data_merge.dropna()

# Step 2: data partition
# This is only a sample; another library or a built-in function works as well.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split -- and every metric computed from it --
# identical on each run of the notebook.
train, validation = train_test_split(
    data_merge,
    test_size=0.4,    # share of rows held out for validation; change as needed
    random_state=42,  # any fixed integer gives a reproducible split
)
train.head()

Unnamed: 0,ID,LastName,FirstName,Woman,Age,Salary,Education,City,SeniorList,NbActivities,...,Recency,Frequency,Seniority,TotalGift,MinGift,MaxGift,GaveLastYear,AmtLastYear,GaveThisYear,AmtThisYear
653803,2653804.0,VERNON,JUSTINE,1.0,51.0,193500.0,University / College,City,5.0,0.0,...,4.0,1.0,4.0,10.0,10.0,10.0,1.0,10.0,0.0,0.0
185636,2185637.0,BERTRAM,JACK,0.0,59.0,20900.0,University / College,Downtown,9.0,2.0,...,1.0,5.0,9.0,175.0,20.0,75.0,0.0,0.0,0.0,0.0
996577,2996578.0,SHEARER,FRANKLIN,0.0,31.0,96300.0,University / College,City,7.0,0.0,...,3.0,1.0,3.0,30.0,30.0,30.0,1.0,20.0,0.0,0.0
686474,2686475.0,STRICKLEN,RAMON,0.0,59.0,11100.0,High School,City,6.0,1.0,...,2.0,1.0,2.0,20.0,20.0,20.0,0.0,0.0,0.0,0.0
897387,2897388.0,FRANKLIN,EVELYN,1.0,41.0,50100.0,University / College,City,3.0,1.0,...,3.0,1.0,3.0,10.0,10.0,10.0,0.0,0.0,0.0,0.0


# Step3 Linear Regression Model (Py)


link for reference:

sk-learn library: https://scikit-learn.org/stable/index.html

In [5]:
from sklearn import linear_model

# Select predictors and target for each split. Column selection on a DataFrame
# yields pandas objects (a DataFrame for X, a Series for Y).
X_train, Y_train = train[['Age', 'Salary', 'Seniority', 'AmtLastYear']], train['AmtThisYear']
X_valid, Y_valid = validation[['Age', 'Salary', 'Seniority', 'AmtLastYear']], validation['AmtThisYear']

# Fit the linear regression on the training split, then predict the validation split.
regr = linear_model.LinearRegression().fit(X_train, Y_train)
regr_predict = regr.predict(X_valid)

In [6]:
#you can change the criteria

import numpy as np
from sklearn import metrics
# Validation errors of the linear regression, printed in the order MAE, MSE, RMSE.
regr_mae = metrics.mean_absolute_error(Y_valid, regr_predict)
regr_mse = metrics.mean_squared_error(Y_valid, regr_predict)
print(regr_mae)
print(regr_mse)
# RMSE is the square root of the MSE computed above.
print(np.sqrt(regr_mse))

19.89312383855728
11294.62749871877
106.27618500265602


# Step4 Decision Tree Model (Py)

In [7]:
from sklearn.tree import DecisionTreeRegressor

# Predictors and target for the training and validation splits.
X_train = train[['Age', 'Salary', 'Seniority', 'AmtLastYear']]
Y_train = train['AmtThisYear']
X_valid = validation[['Age', 'Salary', 'Seniority', 'AmtLastYear']]
Y_valid = validation['AmtThisYear']

# max_depth=5 caps the depth of the tree.
# random_state is fixed because scikit-learn breaks ties between equally good
# splits randomly; without it the fitted tree can differ from run to run.
DT_model = DecisionTreeRegressor(max_depth=5, random_state=42).fit(X_train, Y_train)
DT_predict = DT_model.predict(X_valid)  # predictions on the validation split

In [8]:
#you can change the criteria
# Validation errors of the decision tree, printed in the order MAE, MSE, RMSE.
dt_mae = metrics.mean_absolute_error(Y_valid, DT_predict)
dt_mse = metrics.mean_squared_error(Y_valid, DT_predict)
print(dt_mae)
print(dt_mse)
# RMSE is the square root of the MSE computed above.
print(np.sqrt(dt_mse))

19.9539424226544
11555.768208159234
107.49775908436061


### Other models may also be helpful for this game:
Link for reference:
1. Logistic regression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regression#sklearn.linear_model.LogisticRegression
2. Neural Net: https://scikit-learn.org/stable/modules/classes.html?highlight=neural%20net#module-sklearn.neural_network  

# Step6 Scoring the data

Pick the best model from the previous steps and use it to predict next year's donations.

In [9]:
# Copy the scoring tables from the "cortex" library into pandas DataFrames.

# cortex.score_rd1
data3 = sas_session.sasdata2dataframe(table='score_rd1', libref='cortex')

# cortex.score
data4 = sas_session.sasdata2dataframe(table='score', libref='cortex')

In [10]:
# Right join on ID keeps every row of data4 (score); rows containing any
# missing value are then dropped.
# NOTE(review): rows removed by dropna() are absent from the scored output --
# confirm that this is intended.
scoring_data = data3.merge(data4, on=["ID"], how="right").dropna()
scoring_data.head()

Unnamed: 0,ID,GaveLastYear,AmtLastYear,LastName,FirstName,Woman,Age,Salary,Education,City,SeniorList,NbActivities,Referrals,Recency,Frequency,Seniority,TotalGift,MinGift,MaxGift
0,2000001.0,0.0,0.0,ROMMES,RODNEY,0.0,25.0,107200.0,University / College,City,2.0,0.0,0.0,1.0,2.0,2.0,1010.0,10.0,1000.0
9,2000010.0,0.0,0.0,ALLEN,CHRISTY,1.0,36.0,99100.0,University / College,City,2.0,1.0,0.0,0.0,1.0,0.0,25.0,25.0,25.0
14,2000015.0,0.0,0.0,HOLMES,LEANN,1.0,39.0,91800.0,University / College,City,10.0,5.0,3.0,1.0,2.0,9.0,70.0,20.0,50.0
15,2000016.0,1.0,40.0,JOHNSTON,DONNA,1.0,28.0,72400.0,High School,Suburban,8.0,0.0,1.0,7.0,2.0,8.0,50.0,20.0,30.0
16,2000017.0,1.0,100.0,OLAGUE,DONNA,1.0,50.0,13000.0,High School,Suburban,0.0,0.0,0.0,0.0,1.0,0.0,20.0,20.0,20.0


In [11]:
# Score with the fitted linear regression, using the same four predictor
# columns (in the same order) that the model was trained on.
X = scoring_data.loc[:, ['Age', 'Salary', 'Seniority', 'AmtLastYear']]
regr_predict_end = regr.predict(X)
# Attach the predictions as a new "Prediction" column.
scoring_data = scoring_data.assign(Prediction=regr_predict_end)

In [12]:
# Order the rows from the largest to the smallest predicted amount.
scoring_data = scoring_data.sort_values(by='Prediction', ascending=False)
scoring_data.head()

Unnamed: 0,ID,GaveLastYear,AmtLastYear,LastName,FirstName,Woman,Age,Salary,Education,City,SeniorList,NbActivities,Referrals,Recency,Frequency,Seniority,TotalGift,MinGift,MaxGift,Prediction
954313,2954314.0,1.0,10000.0,SANCHEZ,JADA,1.0,37.0,222700.0,High School,Suburban,10.0,2.0,1.0,1.0,4.0,9.0,95.0,10.0,40.0,268.912052
537220,2537221.0,1.0,10000.0,BROWN,JEFFREY,0.0,48.0,212700.0,University / College,Suburban,0.0,0.0,0.0,0.0,1.0,0.0,100.0,100.0,100.0,265.001148
94130,2094131.0,1.0,10000.0,HOLDEN,SARAH,1.0,62.0,210000.0,University / College,Downtown,10.0,1.0,3.0,0.0,3.0,6.0,50.0,10.0,25.0,264.994888
631673,2631674.0,1.0,10000.0,KOPPENHEFFER,JENNIFER,1.0,34.0,186500.0,University / College,City,9.0,3.0,1.0,0.0,1.0,0.0,500.0,500.0,500.0,264.824929
334249,2334250.0,1.0,10000.0,MANLEY,COLLEEN,1.0,36.0,108300.0,University / College,Suburban,6.0,4.0,3.0,5.0,2.0,6.0,45.0,20.0,25.0,261.566163


# Step7 Exporting CSV (Py)

In [13]:
# Keep only the two exported columns and write them to CSV without the index.
# NOTE(review): ID is shown as a float in the output above (e.g. 2954314.0) and
# is written to the file in that dtype -- confirm the expected ID format.
Result = scoring_data.loc[:, ['ID', 'Prediction']]
Result.to_csv('Round1_Output.csv', index=False)