# Torino Museum Data Analysis & Churn Prediction in R

In [1]:
library(readr)
library("caret")
library("data.table")
library("cluster")
library("factoextra")
library("gridExtra")
library("plyr")
library("corrplot")
library("unbalanced")
library("DMwR")
library("C50")
library("gains")
library("sqldf")
library(dplyr)
library(leaflet)
library(rgdal)
library(mice)
library(VIM)
library(anytime)
library(forcats)
library(ggplot2)
library(caret)
library(evtree)

Loading required package: lattice
Loading required package: ggplot2
Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
corrplot 0.84 loaded
Loading required package: mlr
Loading required package: ParamHelpers

Attaching package: 'mlr'

The following object is masked from 'package:caret':

    train

Loading required package: foreach
Loading required package: doParallel
Loading required package: iterators
Loading required package: parallel
Loading required package: grid
Registered S3 method overwritten by 'xts':
  method     from
  as.zoo.xts zoo 
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Attaching package: 'DMwR'

The following object is masked from 'package:plyr':

    join

Loading required package: gsubfn
Loading required package: proto

Attaching package: 'gsubfn'

The following object is masked from 'package:mlr':

    fn

Loading required package: RSQLite

Attaching package: 'dplyr'


In [4]:
# Show the current working directory, then switch to the project folder so the
# relative CSV paths used in the following cells resolve.
# NOTE(review): this absolute path is machine-specific; the notebook will not
# run on another machine without editing it.
getwd()
setwd('C:/Users/Spectre/Desktop/Museum')


In [5]:
# Load the three input tables:
#   an13  - one row per 2013 subscription (customer attributes and price paid)
#   in13  - one row per museum entry (date, time, ticket value, museum)
#   data1 - per-customer renewal flag (si2014) and related dates
# Later cells call levels() on text columns and talk about factor "levels", so
# text columns are read as factors explicitly. R < 4.0 did this by default;
# R >= 4.0 does not, which would otherwise change the downstream results.
an13 <- read.csv("an13.csv", header = TRUE, stringsAsFactors = TRUE)
in13 <- read.csv("in13.csv", header = TRUE, stringsAsFactors = TRUE)
data1 <- read.csv("data1.csv", header = TRUE, stringsAsFactors = TRUE)

In [7]:
head(data1)
head(an13)
head(in13)

X,codcliente,si2014,ultimo_ing.x,abb13,abb14
1,100005,0,2013-09-19,2013-01-27,
2,100006,1,2013-11-21,2013-01-14,2014-01-09
3,100007,0,,2012-12-05,
4,100017,1,2013-10-19,2012-12-10,2013-12-01
5,100020,0,2013-08-11,2012-12-09,
6,100021,0,2013-08-22,2013-02-18,


X,codcliente,data_inizio,importo,sconto,riduzione,tipo_pag,agenzia,agenzia_tipo,sesso,data_nascita,professione,comune,cap,nuovo_abb
1,90095,09/12/2012 00:00,28,NESSUNO SCONTO,ABBONAMENTO RIDOTTO SCONTATO,CONTANTI,INFOPIEMONTE TORINO CULTURA,PUNTO INFORMATIVO,M,1985,,TORINO,10100,NUOVO ABBONATO
2,89662,06/01/2013 00:00,28,NESSUNO SCONTO,ABBONAMENTO RIDOTTO SCONTATO,CONTANTI,INFOPIEMONTE TORINO CULTURA,PUNTO INFORMATIVO,M,1985,,TORINO,10129,NUOVO ABBONATO
3,202278,27/01/2013 00:00,28,NESSUNO SCONTO,ABBONAMENTO RIDOTTO SCONTATO,BANCOMAT,INFOPIEMONTE TORINO CULTURA,PUNTO INFORMATIVO,M,1992,,TORINO,10137,NUOVO ABBONATO
4,269895,02/12/2012 00:00,28,NESSUNO SCONTO,ABBONAMENTO RIDOTTO SCONTATO,CONTANTI,LIBRERIA MONDADORI,PUNTO COMMERCIALE,M,1986,,SVIZZERA,10036,NUOVO ABBONATO
5,88177,26/01/2013 00:00,28,NESSUNO SCONTO,ABBONAMENTO RIDOTTO SCONTATO,CONTANTI,INFOPIEMONTE TORINO CULTURA,PUNTO INFORMATIVO,M,1983,,TORINO,10149,NUOVO ABBONATO
6,125858,19/04/2013 00:00,28,NESSUNO SCONTO,ABBONAMENTO RIDOTTO SCONTATO,CONTANTI,INFOPIEMONTE TORINO CULTURA,PUNTO INFORMATIVO,F,1989,,TORINO,10100,NUOVO ABBONATO


X,datai,orai,importo,museo,prov_museo,com_museo,CodCliente
1,01/12/2012,10:10,7.5,REGGIA DI VENARIA REALE,TO,VENARIA REALE,46715
2,01/12/2012,10:11,5.0,REGGIA DI VENARIA REALE,TO,VENARIA REALE,46715
3,01/12/2012,10:11,7.5,REGGIA DI VENARIA REALE,TO,VENARIA REALE,133015
4,01/12/2012,10:12,5.0,REGGIA DI VENARIA REALE,TO,VENARIA REALE,133015
5,01/12/2012,10:18,2.5,MUSEO DELLA FRUTTA,TO,TORINO,80732
6,01/12/2012,10:39,5.0,REGGIA DI VENARIA REALE,TO,VENARIA REALE,16834


In [None]:
#Search for missing values in the dataset

In [None]:
# Count NA values per column in each input table.
# NOTE(review): is.na() only counts real NA values. Blank CSV fields in text
# columns (e.g. the empty professione values in the preview above) are
# presumably read as "" and would not be counted here - confirm.
colSums(is.na(an13))
colSums(is.na(in13))
colSums(is.na(data1))

In [None]:
# Impute the missing values of `sesso` with kNN and drop the `professione` column

In [None]:
# Blank out professione so it carries no information into the imputation
# (note that this modifies an13 itself).
an13$professione <- NA
# VIM::kNN fills missing `sesso` values from the 6 nearest neighbours; by
# default it also appends an indicator column for the imputed variable.
an1 <- kNN(an13, variable = c("sesso"), k = 6)
head(an1)
# Remove the profession column by name instead of by position (it was column 12)
# so the step keeps working if the column order of the input ever changes.
an1$professione <- NULL

In [None]:
# Inspect data_nascita (year of birth).
str(an1) 
# str() shows an odd level "1-01" and birth years going back to 1900.
# Drop the malformed values ('1-01', '9-02') and keep birth years 1924-2012 only,
# i.e. ages 7-95 relative to the 2019 reference year used for `age` in the next cell.
# (Original note: children under 10 enter for free.)
an2<- sqldf("Select * from an1
        where data_nascita!='1-01' AND data_nascita!='9-02' AND data_nascita>='1924'
            AND data_nascita<=2012")
# Drops column 14 by position; it does NOT add the age column (that happens in
# the next cell). Presumably column 14 is nuovo_abb at this point - TODO confirm.
an2<-an2[,-14]

head(an2)

In [None]:
# Convert the birth year to a number (going through character so that factor
# codes are never used as years), then derive age relative to the year 2019.
an2$data_nascita <- as.numeric(as.character(an2$data_nascita))
an2$age <- 2019 - an2$data_nascita
head(an2)
levels(an2$riduzione)

In [None]:
## Sanity check: list customers aged 65 or more, cheapest subscriptions first,
## to spot rows whose reduction type does not match their age (the original note
## mentions a "pass 60" reduction applied to people aged 65).
age65<- sqldf("select * from an2
              where age>=65
              order by importo")
head(age65)

In [None]:
# Keep rows with cap >= 10000 and collapse to one row per customer, ordered by cap.
# NOTE(review): `select *` combined with `group by codcliente` lets SQLite pick
# an arbitrary row for customers that appear more than once, and `distinct` is
# redundant once the rows are grouped.
an3<- sqldf("Select distinct * from an2
            where cap>=10000
            group by codcliente
            order by cap")
head(an3)          
##check structure of data
str(an3)
levels(an3$tipo_pag)

In [None]:
## Work on a copy (an5); the next cell sets tipo_pag to "NESSUN PAGAMENTO"
## for rows where importo is 0.
an5<- an3
str(an5)


In [None]:
# Subscriptions with a zero amount get the explicit payment type
# "NESSUN PAGAMENTO". When tipo_pag is a factor, assigning a label that is not
# yet one of its levels would silently produce NA (with a warning), so the
# level is registered first. Existing levels and their order are left untouched.
if (is.factor(an5$tipo_pag) &&
    !("NESSUN PAGAMENTO" %in% levels(an5$tipo_pag))) {
  levels(an5$tipo_pag) <- c(levels(an5$tipo_pag), "NESSUN PAGAMENTO")
}
# which() ignores rows where importo is NA.
an5$tipo_pag[which(an5$importo == 0)] <- "NESSUN PAGAMENTO"

In [None]:
## Verify that no row with cap < 10000 is left in an3.
## The query returns one count per customer, so an empty result means the
## filter worked.
an4<- sqldf("Select count(*) from an3
            where cap<10000
            group by codcliente
            order by cap")
an4

In [None]:
## Convert age to groups.
min(an5$age)
max(an5$age)
head(an5)
# Bin ages into [7,19], (19,35], (35,50], (50,65] and (65, Inf].
# The open upper bound replaces max(an5$age): cut() stops with "'breaks' are
# not unique" when the maximum age is exactly 65 and fails when it is NA.
# cut() already returns a factor, so no extra conversion is needed. The labels
# are matched literally by later SQL queries and must not change.
an5$age <- cut(an5$age, breaks = c(7, 19, 35, 50, 65, Inf),
               labels = c("7-19", "20-35", "35-50", "50-65", "65+"),
               include.lowest = TRUE)

In [None]:
## Check the remaining columns, then drop column 11 by position.
## Presumably this is data_nascita, which is no longer needed once the age
## group exists - TODO confirm against the str() output.
str(an5)
head(an5)
an5<-an5[,-11]



## Data Analysis and Visualization using SQL

In [None]:
## Number of customers per age group (input for the pie chart below),
## sorted from the smallest to the largest group.

head(an5)
q1<-sqldf("select count(codcliente) AS freq, age from an5
          group by age
          order by freq")
head(q1)

In [None]:
# Share of each age group in percent; the table stores it rounded to whole numbers.
percentage <- q1$freq / sum(q1$freq) * 100
q1$percentage <- round(percentage, digits = 0)
q1$age <- as.character(q1$age)

# Five-colour palette reused by the pie charts.
mycols <- c("#008000", "#FFF44F", "#008080", "#000FFF", "#F80000")

In [None]:
## Pie chart of the customer share per age group: a single stacked bar wrapped
## onto polar coordinates. The misspelled title ("ditribution") is corrected.
ggplot(q1, aes(x = "", y = percentage, fill = age)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  scale_fill_manual(values = mycols) +
  labs(title = "Age wise customer distribution")

In [None]:
#table form
q1

In [None]:
# The seven most visited museums, by number of entries recorded in in13.
# NOTE(review): View() opens an interactive viewer and may not work outside an
# interactive session.

mfreq<-sqldf("select museo,count(*) AS frequency from in13
      group by museo
      order by frequency DESC
             Limit 7")
View(mfreq)

In [None]:
# Attach hard-coded latitude/longitude values to the seven rows of mfreq.
# NOTE(review): the coordinates are presumably listed in the same order as the
# rows of mfreq; if the ranking changes they will be attached to the wrong
# museum - confirm. museimap is not used again in the visible notebook.
mlat<-c(45.135163,45.070865,45.064770,45.069126,45.072811,45.068392,45.608223)
mlon<-c(7.625653,7.685953,7.669370,7.693096,7.686482,7.684416,7.744696)
museimap<-cbind(mfreq,mlat,mlon)

In [None]:
## now for each age group, find the gender distribution

In [None]:
# Number of customers for every combination of age group and gender.
q2<-sqldf("select count(codcliente) AS freq, age, sesso from an5
          group by age, sesso
          order by freq, sesso")
q2

In [None]:
# Side-by-side bars: customers per age group, split by gender.
ggplot(q2, aes(x = age, y = freq, fill = sesso)) +
  geom_col(position = "dodge")

In [None]:
## Observations from the age/gender chart:
## - there are very few teenagers;
## - in the 20-35 group, women are about twice as numerous as men;
## - the remaining groups show a similar split, except 35-50, where the two
##   genders are almost equally represented.

#### Mean importo (amount paid) per age group, highest mean first.
# The sort key is the computed mean. Ordering by the bare `importo` column in
# an aggregate query sorts on an arbitrary row of each group.
q3 <- sqldf("select AVG(importo) AS mean, age from an5
          group by age
          order by mean DESC")

In [None]:
head(q3)
head(an13)
##the highest paying groups were 35-50,50-65
#whereas the least paying were teenagers and young adults

In [None]:
## Mean importo per age group with an explicit column name for plotting,
## highest mean first. The sort key is the computed mean: ordering by the bare
## `importo` column in an aggregate query sorts on an arbitrary row per group.
q4 <- sqldf("select AVG(importo) AS mean_importo, age from an5
          group by age
          order by mean_importo DESC")
head(q4)

In [None]:
# Bar chart of the mean amount paid per age group, tallest bar first.
ggplot(q4, aes(x = reorder(age, -mean_importo), y = mean_importo)) +
  geom_col(width = 0.8, fill = "#191970") +
  labs(title = "Ordered Bar Chart", subtitle = "mean_importo vs age group") +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    text = element_text(size = 10)
  )

In [None]:
##TOP 3 most popular sconto for each age group

In [None]:
# Customers per (sconto, age group) combination.
q5 <- sqldf("select count(*) AS customers, sconto, age from an5
          group by sconto, age
          order by age, customers DESC")

# Top 3 discounts within each age group. Every query states its own ordering:
# `limit 3` without `order by` would depend on the physical row order of q5,
# which SQL does not guarantee.
q51 <- sqldf("select * from q5 where age='7-19' order by customers DESC limit 3")
q52 <- sqldf("select * from q5 where age='20-35' order by customers DESC limit 3")
q53 <- sqldf("select * from q5 where age='35-50' order by customers DESC limit 3")
q54 <- sqldf("select * from q5 where age='50-65' order by customers DESC limit 3")
q55 <- sqldf("select * from q5 where age='65+' order by customers DESC limit 3")
q6 <- rbind(q51, q52, q53, q54, q55)

head(q6)

In [None]:
# Side-by-side bars: the most used discounts within each age group.
ggplot(q6, aes(x = age, y = customers, fill = sconto)) +
  geom_col(position = "dodge")

In [None]:
## Overall popularity of each payment method, most used first.
q7<-sqldf("select count(*) AS freq, tipo_pag from an5 group by tipo_pag 
          Order by freq  DESC")
head(q7)

In [None]:
# Pie chart of the payment-type counts (a stacked bar on polar coordinates).
# NOTE(review): mycols holds five colours; scale_fill_manual() fails if q7
# contains more than five payment types - confirm the number of levels.
ggplot(q7, aes(x = "", y = freq, fill = tipo_pag)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0)+
  scale_fill_manual(values = mycols) + 
  labs(title="Payment type distribution")

In [None]:
## Cash is the most used payment method; bancomat (debit card) comes second.

## Payment-method counts broken down by age group.
q8<-sqldf("select count(*) AS freq, tipo_pag,age from an5 
          group by tipo_pag, age 
          Order by age DESC, freq  DESC")
head(q8)

In [None]:
## Side-by-side bars: payment methods within each age group, to check whether
## the overall ranking of payment methods also holds per age group.
ggplot(q8, aes(x = age, y = freq, fill = tipo_pag)) +
  geom_col(position = "dodge")

In [None]:
## The ten municipalities (comune) with the most subscribers.
q9<- sqldf("select comune, count(*) as freq 
            from an5 
           group by comune order by freq DESC
           limit 10")
head(q9)

In [None]:
# One bar per municipality for the ten largest ones.
ggplot(q9, aes(x = comune, y = freq, fill = comune)) +
  geom_col(width = 1, color = "white") +
  labs(title = "Residential distribution") +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    text = element_text(size = 8)
  )

In [None]:
## Most subscribers live in Torino; the next largest group is rows with missing comune data

In [None]:
# Visit log (in13): make sure the time and date columns are plain character
# vectors before they are pasted together.
for (text_col in c("orai", "datai")) {
  in13[[text_col]] <- as.character(in13[[text_col]])
}

In [None]:
# Build a single visit timestamp (time1) from the date and time strings.
visit_stamp <- paste(in13$datai, in13$orai, sep = " ")
str(visit_stamp)
in13$time1 <- as.POSIXct(visit_stamp, format = "%d/%m/%Y %H:%M")

In [None]:
# Use the same customer-id column name as an13 (codcliente) and drop the
# original id, date and time columns.
in13$codcliente <- in13$CodCliente
in13 <- in13[, setdiff(names(in13), c("CodCliente", "datai", "orai")),
             drop = FALSE]

In [None]:
# Number of visits per customer, most frequent visitors first.
# (`distinct` has no effect here because the rows are already grouped by codcliente.)
i1<-   sqldf("select distinct codcliente, count(*) as visits from in13 
              group by codcliente
               order by visits DESC")

In [None]:
head(i1)

In [None]:
head(in13)

In [None]:
## Comparing the row counts of an5 and i1 shows that about 9000 subscribers
## never visited a museum (original author's figure).
# Average number of visits per person among those who visited at least once.
mean(i1$visits)
# Result noted by the author: about 7 visits per person.

## The ten most popular museums by number of entries.
i2<-   sqldf("select count(*) as freq, museo from in13 
              group by museo
               order by freq DESC
               limit 10")
head(i2)

In [None]:
## Per customer: number of visits and the total ticket value of those visits,
## i.e. what the customer would hypothetically have paid without a subscription.
## The unnamed aggregate gets the literal column name `sum(importo)`.
i3<-  sqldf("select codcliente, count(*) as freq_visits, sum(importo) from in13 
             group by codcliente
             order by freq_visits DESC")
head(i3)
# Small comparison table: mean hypothetical amount vs mean subscription price.
# NOTE(review): o5 is built here but never displayed or used in the visible
# notebook; the second mean uses the raw an13 table, not the cleaned an5.
o3<-c("amount hypothetically to pay","amount paid")
o4<-c(mean(i3$`sum(importo)`),mean(an13$importo))
o5<-cbind(o3,o4)

In [None]:
# Amount passed on for each customer's visits: half of the total ticket value.
i4 <- i3[, c("codcliente", "freq_visits")]
i4$associationpayment <- i3[["sum(importo)"]] / 2
head(i4)

In [None]:
## Margin in percent: share of the subscription revenue (o2) left after the
## association payments (o1).
## NOTE(review): o2 sums importo over the raw an13 table, not the cleaned an5.
o1<-sum(i4$associationpayment)
o2<-sum(an13$importo)
profit<- ((o2-o1)/o2)*100
profit

In [None]:
# Combine the customer attributes (an5) with the visit summary (i4).
# Despite the original wording ("left join"), this comma join with an ON
# condition keeps only customers present in BOTH tables; customers without
# visits are handled separately further down (x3).
i5<-sqldf("select * from an5 as A,i4 as I 
         ON A.codcliente=I.codcliente")
# Drops column 13 by position - TODO confirm which column this is.
i5<-i5[,-13]
head(i5)

In [None]:
## Join with data1 to attach the renewal information (si2014) that says who
## churned; only customers present in both tables are kept.
total<-sqldf("select * from i5 as A,data1 as D 
         ON A.codcliente=D.codcliente")
head(total)

In [None]:
# Keep a subset of the joined columns, selected by position.
# NOTE(review): positional selection silently picks other columns if the column
# order of any upstream table changes - verify against head(total) above.
total<-total[,c(2,3,4,5,6,7,8,9,10,11,12,13,15,16,19,20,21,22)]

head(total)

In [None]:
# One row per customer that has visit data.
x1<-sqldf("select * from i5 group by codcliente")
# Customers who never visited: anti-join of an5 against x1 on codcliente.
# Note that setDT() converts an5 itself to a data.table by reference.
x3<-setDT(an5)[!x1, on="codcliente"]
head(x3)

In [None]:
# Attach the renewal information from data1 to the customers who never visited.
x4<-sqldf("select * from x3 as A, data1 as I where
          A.codcliente=I.codcliente")
head(x4)

In [None]:
# Remove unwanted columns by position: first column 1, then column 12 of the
# already reduced table.
# NOTE(review): the identity of these columns cannot be verified from here -
# check against head(x4).
x4<-x4[,-1]
x4<-x4[,-12]

head(x4)

In [None]:
# Drop column 13 twice: after the first removal the next column moves into
# position 13, so this removes two adjacent columns.
x4<-x4[,-13]
x4<-x4[,-13]

## Data preparation for Churn Prediction

In [None]:
# Non-visitors get zero visits and a zero association payment so that their
# table can be stacked with the visitors' table.
k1<-x4
k1$freq_visits<-as.integer(0)
k1$associationpayment<-as.numeric(0)
# Drop columns 14-16 by position, then reorder so that the former column 13
# comes last. Presumably this matches the column layout of `main` - TODO confirm.
k1<-k1[,c(-14,-15,-16)]
k1<-k1[,c(1:12,14,15,13)]
head(k1)

In [None]:
# Visitors' table: drop columns 16-18 by position (per the original note: last
# entry date, 2013 subscription date and 2014 subscription date).
main<-total
main<-main[,c(-16,-17,-18)]

head(main)

In [None]:
## Stack customers who visited (main) and customers who did not (k1).
## rbind() on data frames requires both tables to have the same column names.
combined<-rbind(main,k1)

In [None]:
# Check the structure of the combined table and make the renewal flag a factor.
str(combined)
combined$si2014<-as.factor(combined$si2014)
## IMPORTANT CHECKPOINT: `combined` still holds the customer id and is used
## later to map predictions back to customers; `train` is the modelling copy
## from which the id is removed in the next cell.
train<-combined
head(train)

In [None]:
# Remove the client ID (first column).
train <- train[, -1]

# The subscription start date arrives as text such as "09/12/2012 00:00".
train$data_inizio <- as.character(train$data_inizio)

# Derive the week of the year in which the subscription started.
# Parsing and date conversion both use UTC explicitly. Parsing in the local
# time zone and then calling as.Date(), which converts POSIXct in UTC by
# default, moves midnight timestamps to the previous day in zones ahead of UTC
# (e.g. Europe/Rome) and can shift the week number.
train$startdate <- as.POSIXct(train$data_inizio, format = "%d/%m/%Y %H:%M",
                              tz = "UTC")
train$data_inizio <- NULL
train$weekstart <- format(as.Date(train$startdate, tz = "UTC"), "%W")
train$startdate <- NULL
train$weekstart <- as.factor(train$weekstart)
str(train)
head(train)

#### CORRELATION ANALYSIS

In [None]:
# Correlation between the numeric variables of the modelling table.
train$importo <- as.numeric(train$importo)
train$freq_visits <- as.numeric(train$freq_visits)
# One TRUE/FALSE flag per column marking the numeric ones.
numeric.var <- vapply(train, is.numeric, logical(1))
corr.matrix <- cor(train[, numeric.var])
corrplot(corr.matrix,
         main = "\n\nCorrelation Plot for Numerical Variables",
         method = "number")
cor(train$freq_visits, train$importo)

In [None]:
## The correlation plot shows that freq_visits is strongly correlated with
## associationpayment, while importo is correlated with neither of them.
## Only associationpayment and importo are therefore kept as numeric
## predictors, and freq_visits is dropped.
train$freq_visits<-NULL



In [None]:
## Build the churn label: a customer who did not renew in 2014 (si2014 == "0")
## is a churner ("Yes"); everyone else is "No".
str(train)
k5 <- ifelse(as.character(combined$si2014) == "0", "Yes", "No")
k5


In [None]:
# Store the label as a factor column `Churn` and drop the raw renewal flag.
Churn <- k5 <- as.factor(k5)
str(k5)
train$Churn <- Churn
train$si2014 <- NULL
str(train)
train

In [None]:
##to deal with comune, agenzia, cap, weekstart, we take top values and label rest as others
an13

In [None]:
##somehow comune data got deleted, so readd that data
#comune<-sqldf("select A.comune from an13 as A, combined as C where
#          A.codcliente=C.codcliente")
#str(comune)
#train$comune<-as.character(comune)

In [None]:
set.seed(1)
str(train)
# For each high-cardinality column keep only its n most frequent values and
# merge the rest into "Other" (forcats::fct_lump).
lump_top <- c(agenzia = 4, agenzia_tipo = 4, cap = 4, comune = 2,
              weekstart = 4, sconto = 4, riduzione = 4)
# Fail early with a clear message if an expected column is absent.
missing_cols <- setdiff(names(lump_top), names(train))
if (length(missing_cols) > 0) {
  stop("Columns missing from train: ",
       paste(missing_cols, collapse = ", "), call. = FALSE)
}
for (col in names(lump_top)) {
  # as.factor() lets this work for numeric columns too (e.g. an integer cap),
  # which fct_lump() rejects; columns that are already factors are unchanged.
  train[[col]] <- fct_lump(as.factor(train[[col]]), n = lump_top[[col]])
}

In [None]:
str(train)

In [None]:
train

#### Gradient Boosting Model 

In [None]:
##### GBM
# Preparation for dummy-variable encoding: the Churn factor is replaced by its
# integer codes (first level -> 1, second level -> 2). This overwrites the
# column in `train`, so later cells see the integer version.
train$Churn<-as.integer(train$Churn)
View(train)

In [None]:
str(train)

In [None]:
# One-hot encode every factor/character column with caret::dummyVars.
# fullRank=F keeps one indicator column per level (no reference level dropped).
dummy <- dummyVars("~.",data=train, fullRank=F)
dv1 <- as.data.frame(predict(dummy,train))
print(names(dv1))

# Class balance of the outcome.
prop.table(table(dv1$Churn))

In [None]:
# Keep a copy of the numeric outcome.
# NOTE(review): tempOutcome is not used again in the visible notebook.
tempOutcome <- dv1$Churn  

In [None]:
# Name of the outcome column and the names of all remaining (predictor) columns.
outcomeName <- 'Churn'
all_columns <- names(dv1)
predictorsNames <- all_columns[!(all_columns %in% outcomeName)]

In [None]:
outcomeName
predictorsNames

In [None]:
#################################################
# model it
#################################################
# get names of all caret supported models
names(getModelInfo())
str(dv1$Churn)

In [None]:
# Turn the integer outcome back into a labelled factor: code 2 is "Yes"
# (churned), anything else is "No".
l <- as.factor(ifelse(as.character(dv1$Churn) == "2", "Yes", "No"))

In [None]:
l

In [None]:
dv1$Churn<-l
str(dv1)

In [None]:
###########################################################
# Temporarily add the customer id so that it goes through the train/test split
# with its row; it is removed from both parts again before modelling.
dv1$codcliente<-combined$codcliente
clientid<-dv1$codcliente

In [None]:
# pick model gbm and find out what type of model it is
getModelInfo()$gbm$type

In [None]:
# Split the data 75% / 25% into training and test sets with
# caret::createDataPartition on the outcome; the seed makes the split
# reproducible.
set.seed(1234)
splitIndex <- createDataPartition(dv1[,outcomeName], p = .75, list = FALSE, times = 1)
trainDF <- dv1[ splitIndex,]
testDF  <- dv1[-splitIndex,]
dim(trainDF)
dim(testDF)

In [None]:
# Store the customer ids of both parts separately (cidtest is used later to
# label the predictions) and remove the id column so it is not used as a predictor.
cidtrain<-trainDF$codcliente
cidtest<-testDF$codcliente
trainDF$codcliente<-NULL
testDF$codcliente<-NULL

In [None]:
# caret resampling setup: 3-fold cross-validation. twoClassSummary together
# with classProbs = TRUE makes ROC available as the selection metric.
objControl <- trainControl(method='cv', number=3, returnResamp='none', summaryFunction = twoClassSummary, classProbs = TRUE)

In [None]:
# run model
str(trainDF)

In [None]:
# Fit a gradient boosting model (gbm) through caret, tuned on ROC, with
# predictors centred and scaled. The call is namespaced as caret::train because
# mlr masks train() (see the load messages at the top of the notebook).
objModel <- caret::train(trainDF[,predictorsNames], as.factor(trainDF[,outcomeName]), 
                         method='gbm', 
                         trControl=objControl,  
                         metric = "ROC",
                         preProc = c("center", "scale"))

In [None]:
# find out variable importance
summary(objModel)

In [None]:
# find out model details
##gbm takes interaction depth, no. of trees and shrinkage.
##best tuning parameter 
objModel

In [None]:
trainDF

In [None]:
trainDF$associationpayment

In [None]:
# Class predictions on the test set, followed by the agreement between
# predicted and observed classes as computed by caret::postResample.
predictions <- predict(object=objModel, testDF[,predictorsNames], type='raw')
head(predictions)
print(postResample(pred=predictions, obs=as.factor(testDF[,outcomeName])))

In [None]:
# Class probabilities on the test set. `predictions` is overwritten and is now
# a data frame of per-class probabilities. pROC is loaded here for roc(), which
# is used in the next cell.
library(pROC)
predictions <- predict(object=objModel, testDF[,predictorsNames], type='prob')
head(predictions)

In [None]:
# Observed outcome coded 1 for churners ("Yes") and 0 otherwise.
# The label is spelled with a capital "Y" (the levels are "No"/"Yes"); the
# previous lower-case 'yes' never matched, so every observation was coded 0.
churn_obs <- ifelse(testDF[, outcomeName] == "Yes", 1, 0)
# Regression-style fit of the predicted churn probability (second probability
# column) against the 0/1 outcome.
postResample(pred = predictions[[2]], obs = churn_obs)
str(testDF$Churn)
# Area under the ROC curve.
auc <- roc(churn_obs, predictions[[2]])
print(auc$auc)

##### Selecting top 5000 customers to contact based on prediction

In [None]:
# Attach the customer id and the predicted probability of churn ("Yes") to the
# test rows.
testDF$clientID<-cidtest
testDF$prob_churn<-predictions$Yes
testDF

In [None]:
# Profit per customer: subscription price minus the association payment.
testDF$profit<-testDF$importo-testDF$associationpayment

In [None]:
# CUSTOMERS TO CONTACT FOR THE CAMPAIGN: the 5000 test customers with the
# highest profit.
# NOTE(review): the ranking uses profit only; prob_churn is selected but does
# not influence which customers are chosen - confirm this is intended.
customers_to_contact<-sqldf("select clientID,profit,prob_churn from testDF order by profit DESC
                            limit 5000")

In [None]:
customers_to_contact

In [None]:
#profit=importo-associationpayment
#View(data1)
#minimization<-cbind(predictions$Yes)

In [None]:
################################################
# glmnet model
################################################

# pick model gbm and find out what type of model it is
getModelInfo()$glmnet$type

In [None]:
# save the outcome for the glmnet model
str(dv1$Churn)
dv1$Churn<-l
str(dv1)

In [None]:
#temporarily adding codice cliente to remove after split
dv1$codcliente<-combined$codcliente

In [None]:
# Same 75/25 split as for the gbm model (same seed and same outcome vector).
set.seed(1234)
splitIndex <- createDataPartition(dv1[,outcomeName], p = .75, list = FALSE, times = 1)
trainDF <- dv1[ splitIndex,]
testDF  <- dv1[-splitIndex,]

In [None]:
# Store the customer ids of both parts and remove the id column from the
# modelling data.
cidtrain<-trainDF$codcliente
cidtest<-testDF$codcliente
trainDF$codcliente<-NULL
testDF$codcliente<-NULL

In [None]:
# Logistic regression on the dummy-encoded training data.
# NOTE(review): the dummies were created with fullRank=F, so each group of
# indicator columns is presumably collinear and some coefficients may be
# reported as NA - confirm in the summary.
str(trainDF)
LR <- glm(Churn~.,family=binomial(link="logit"),data=trainDF)

print(summary(LR))
LR
anova(LR, test="Chisq")

In [None]:
# Assess the model on the held-out data.
# Predicted probabilities from the logistic model.
fitted.results <- predict(LR, newdata = testDF, type = 'response')
# Recode the observed labels to "0"/"1" so they can be compared with the
# thresholded predictions. The recode previously targeted an object named
# `test`, which is not defined anywhere in this notebook; the held-out set
# here is testDF.
testDF$Churn <- as.character(testDF$Churn)
testDF$Churn[testDF$Churn == "No"] <- "0"
testDF$Churn[testDF$Churn == "Yes"] <- "1"
fitted.results

In [None]:
# Accuracy (%) of the logistic model on the test set for cut-offs 0.1 ... 0.9.
cutoffs <- seq(0.1, 0.9, 0.1)
# Observed outcome as 0/1. The churn class is labelled "Yes", or "1" if the
# labels were already recoded; comparing the raw "No"/"Yes" labels with the
# 0/1 predictions never matches and yields 0% accuracy at every cut-off.
actual <- ifelse(as.character(testDF$Churn) %in% c("Yes", "1"), 1, 0)
accuracy <- vapply(cutoffs, function(cutoff) {
  prediction <- ifelse(fitted.results >= cutoff, 1, 0) # predict at this cut-off
  sum(prediction == actual, na.rm = TRUE) / length(prediction) * 100
}, numeric(1))

In [None]:
plot(cutoffs, accuracy, pch =19,type='b',col= "steelblue",
     main ="Logistic Regression", xlab="Cutoff Level", ylab = "Accuracy %")

In [None]:
######################################################
###glm without dummy variables

In [None]:
train

In [None]:
# Make Churn a factor again (it was turned into integer codes for the dummy
# encoding) and take a working copy for the model without dummy variables.
str(train)
train$Churn <- p1 <- as.factor(train$Churn)
p2 <- train

In [None]:
#temporarily adding codice cliente to remove after split
p2$codcliente<-combined$codcliente

In [None]:
# 75/25 split of the non-dummy table, using the same seed as the other models.
set.seed(1234)
splitIndex <- createDataPartition(p2[,outcomeName], p = .75, list = FALSE, times = 1)
glmtrain <- p2[ splitIndex,]
glmtest  <- p2[-splitIndex,]

In [None]:
# Store the customer ids of both parts and remove the id column from the
# modelling data.
cidtrain<-glmtrain$codcliente
cidtest<-glmtest$codcliente
glmtrain$codcliente<-NULL
glmtest$codcliente<-NULL

In [None]:
# Logistic regression on the original (non-dummy) predictors. With a factor
# response, glm() models the probability of the second factor level.
str(glmtrain)
LR <- glm(Churn~.,family=binomial(link="logit"),data=glmtrain)

print(summary(LR))

In [None]:
LR

In [None]:
anova(LR, test="Chisq")

In [None]:
# Predicted probabilities on the held-out data.
# NOTE(review): View() opens an interactive viewer and may not work outside an
# interactive session.
fitted.results <- predict(LR,newdata=glmtest,type='response')
View(fitted.results)

In [None]:
# Accuracy (%) of the logistic model on the test set for cut-offs 0.1 ... 0.9.
cutoffs <- seq(0.1, 0.9, 0.1)
# Observed outcome as 0/1 using the factor coding glm() itself uses: first
# level -> 0, second level -> 1. Comparing the raw labels (levels "1"/"2" at
# this point) with the 0/1 predictions matched a predicted 1 against level "1",
# the non-churn class, and never matched a predicted 0, so the reported
# accuracy was wrong.
actual <- as.integer(as.factor(glmtest$Churn)) - 1L
accuracy <- vapply(cutoffs, function(cutoff) {
  prediction <- ifelse(fitted.results >= cutoff, 1, 0) # predict at this cut-off
  sum(prediction == actual, na.rm = TRUE) / length(prediction) * 100
}, numeric(1))

In [None]:
plot(cutoffs, accuracy, pch =19,type='b',col= "steelblue",
     main ="Logistic Regression", xlab="Cutoff Level", ylab = "Accuracy %")

In [None]:
################################################################
#################EVTREE
### without dummy variables

train

In [None]:
str(train)

In [None]:
# Run the two commented lines below only if Churn is not already a factor.
#p1<-as.factor(train$Churn)
#train$Churn<-p1
# Working copy for the evtree model.
p3<-train

In [None]:
#temporarily adding codice cliente to remove after split
p3$codcliente<-combined$codcliente

In [None]:
# 75/25 split for the evtree model, using the same seed as the other models.
set.seed(1234)
splitIndex <- createDataPartition(p3[,outcomeName], p = .75, list = FALSE, times = 1)
evtrain <- p3[ splitIndex,]
evtest  <- p3[-splitIndex,]

In [None]:
# Store the customer ids of both parts and remove the id column from the
# modelling data.
cidtrain<-evtrain$codcliente
cidtest<-evtest$codcliente
evtrain$codcliente<-NULL
evtest$codcliente<-NULL


In [None]:
# pick model glm and find out what type of model it is
str(evtrain)

In [None]:
# Fit an evolutionary classification tree (evtree) for Churn on all predictors.
evt<-evtree(Churn~.,data=evtrain)
# Exploratory leftovers, kept commented out:
#plot(evt)
#predict(evt,evtest)
#table(trainDF[])
#View(train)
#levels(train$cap)
#str(train)

# Prints the whole an13 table.
an13

In [None]:
################################# random forest model
# NOTE(review): `tr` is not defined anywhere in the visible notebook; this cell
# fails unless a training table with that name exists in the session
# (presumably left over from a deleted cell) - confirm which split was intended.
library(randomForest)
rfm <- randomForest(Churn ~., data = tr)
print(rfm)
rfm

In [None]:
# Predictions for the accuracy test and confusion matrix.
# NOTE(review): `test` is not defined anywhere in the visible notebook -
# confirm which held-out table was intended.
rfpred <- predict(rfm, test)
str(rfpred)

In [None]:
str(n)

In [None]:
# Observed labels for the confusion matrix: map "0"/"1" back to "No"/"Yes"
# with plyr::mapvalues.
# NOTE(review): depends on `test`, which is not defined in the visible
# notebook; the preceding cell also calls str(n) before n is created here.
n<-as.factor(test$Churn)
n<-as.factor(mapvalues(n,
                       from=c("0","1"),
                       to=c("No", "Yes")))

In [None]:
caret::confusionMatrix(rfpred, n)

In [None]:
plot(rfm)

In [None]:
##################### kNN classifier
# The stray call to VIM::kNN (missing-value imputation on an13) that used to
# open this cell has been removed: it re-ran the expensive imputation, printed
# the whole imputed table and discarded the result. It is unrelated to the kNN
# classifier fitted below.
str(p3)
# Keep the factor outcome aside, then use integer codes for dummy encoding.
p4 <- p3$Churn
p3$Churn <- as.integer(p3$Churn)

In [None]:
# One-hot encode the factor/character columns of p3 (fullRank=F keeps one
# indicator per level). p3 still contains codcliente at this point.
dummy <- dummyVars("~.",data=p3, fullRank=F)
dv2 <- as.data.frame(predict(dummy,p3))
print(names(dv2))

In [None]:
# Put the factor version of Churn (saved in p4) back as the outcome.
dv2$Churn<-p4
# The customer id does not need to be re-added here: p3 already carried it
# into dv2.
#p3$codcliente<-combined$codcliente
# The train/test split follows in the next cell.

In [None]:
# 75/25 split for the kNN model, using the same seed as the other models.
set.seed(1234)
splitIndex <- createDataPartition(dv2[,outcomeName], p = .75, list = FALSE, times = 1)
knntrain <- dv2[ splitIndex,]
knntest  <- dv2[-splitIndex,]

In [None]:
# Store the customer ids of both parts and remove the id column from the
# modelling data.
cidtrain<-knntrain$codcliente
cidtest<-knntest$codcliente
knntrain$codcliente<-NULL
knntest$codcliente<-NULL

In [None]:
# Fit a k-nearest-neighbours classifier through caret: 10-fold cross-validation
# repeated 3 times, 20 candidate values of k (tuneLength), predictors centred
# and scaled.
str(knntrain)
trControl2<-trainControl(method="repeatedcv",number=10,repeats=3)
knnfit<-caret::train(Churn ~.,data=knntrain,method='knn',tuneLength=20,
                     trControl=trControl2, preProc=c("center","scale"))