In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [2]:
# Loading data set
claims=pd.read_csv("E:\\data science assigment\\video study\\Claims.csv")
claims.head()

Unnamed: 0.1,Unnamed: 0,Region,State,Area,City,Consumer_profile,Product_category,Product_type,AC_1001_Issue,AC_1002_Issue,...,TV_2001_Issue,TV_2002_Issue,TV_2003_Issue,Claim_Value,Service_Centre,Product_Age,Purchased_from,Call_details,Purpose,Fraud
0,1,South,Karnataka,Urban,Bangalore,Business,Entertainment,TV,0,0,...,1,2,0,15000.0,10,60,Manufacturer,0.5,Complaint,1
1,2,South,Karnataka,Rural,Bangalore,Business,Household,AC,1,1,...,0,0,0,20000.0,12,10,Dealer,1.0,Complaint,0
2,3,North,Haryana,Urban,Chandigarh,Personal,Household,AC,0,1,...,0,0,0,18000.0,14,10,Dealer,1.4,Claim,0
3,4,South,Tamilnadu,Urban,Chennai,Business,Entertainment,TV,0,0,...,1,1,0,12000.0,16,20,Manufacturer,2.0,Complaint,0
4,5,North East,Jharkhand,Rural,Ranchi,Personal,Entertainment,TV,0,0,...,0,1,2,25000.0,15,6,Dealer,1.3,Claim,0


In [4]:
#### DATA CLEANSING ####

## Region correction according to states
claims.loc[(claims.State == "Delhi") | (claims.State == "Uttar Pradesh") |(claims.State == "UP") |
        (claims.State == "Haryana") | (claims.State == "HP") | (claims.State == "J&K"), "Region"] = "North"

claims.loc[(claims.State == "Andhra Pradesh") | (claims.State == "Karnataka") |
        (claims.State == "Kerala") | (claims.State == "MP") | (claims.State == "Tamilnadu") | 
        (claims.State == "Telengana"), "Region"] = "South"

claims.loc[(claims.State == "Assam") | (claims.State == "Jharkhand") |
        (claims.State == "Tripura") | (claims.State == "West Bengal"), "Region"] = "East"

claims.loc[(claims.State == "Gujarat"), "Region"] = "West"
        
claims.loc[(claims.State == "Bihar") | (claims.State == "UP") | (claims.State == "Uttar Pradesh"), "Region"] = "North East"

claims.loc[(claims.State == "Goa") | (claims.State == "Maharashtra"), "Region"] = "South West"

claims.loc[(claims.State == "Odisha"), "Region"] = "South East"
        
claims.loc[(claims.State == "Rajasthan"), "Region"] = "North West"     


In [5]:
## Replacing UP with Uttar Pradesh 
claims.loc[(claims.State == "UP"), "State"] = "Uttar Pradesh"

In [6]:
## Replacing claim with Claim
claims.loc[(claims.Purpose == "claim"), "Purpose"] = "Claim"


In [7]:
## Separating hyderbad among two states. like Andhra Pradesh = Hyderbad, Telengana = Hyderabad 1
claims.loc[(claims.State == "Telengana"), "City"] = "Hyderabad 1"


In [8]:
# Deleting first column
claims.drop(["Unnamed: 0"],inplace=True,axis=1) 

In [9]:
#### EXPLORATORY DATA ANALYSIS ####
list(claims.columns)

['Region',
 'State',
 'Area',
 'City',
 'Consumer_profile',
 'Product_category',
 'Product_type',
 'AC_1001_Issue',
 'AC_1002_Issue',
 'AC_1003_Issue',
 'TV_2001_Issue',
 'TV_2002_Issue',
 'TV_2003_Issue',
 'Claim_Value',
 'Service_Centre',
 'Product_Age',
 'Purchased_from',
 'Call_details',
 'Purpose',
 'Fraud']

In [10]:
claims.shape[False]

11917

In [11]:
claims.duplicated(subset=None, keep='first').sum()

11559

In [12]:
claims1=claims.drop_duplicates(keep="first")

In [13]:
claims1.shape

(358, 20)

In [14]:
## filling NA values
claims1.isnull().sum()

Region              0
State               0
Area                0
City                0
Consumer_profile    0
Product_category    0
Product_type        0
AC_1001_Issue       0
AC_1002_Issue       0
AC_1003_Issue       0
TV_2001_Issue       0
TV_2002_Issue       0
TV_2003_Issue       0
Claim_Value         9
Service_Centre      0
Product_Age         0
Purchased_from      0
Call_details        0
Purpose             0
Fraud               0
dtype: int64

In [15]:
claims1["Claim_Value"].fillna(7370,inplace=True)  ##median of claim value is 7370 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [16]:
claims1.isnull().sum()

Region              0
State               0
Area                0
City                0
Consumer_profile    0
Product_category    0
Product_type        0
AC_1001_Issue       0
AC_1002_Issue       0
AC_1003_Issue       0
TV_2001_Issue       0
TV_2002_Issue       0
TV_2003_Issue       0
Claim_Value         0
Service_Centre      0
Product_Age         0
Purchased_from      0
Call_details        0
Purpose             0
Fraud               0
dtype: int64

In [17]:
## creating dummies for categorical variables
dummies = pd.get_dummies(claims1[['Region','State','Area','City','Consumer_profile','Product_category','Product_type',
                                  'Purchased_from','Purpose']])


In [18]:
# Dropping the columns for which we have created dummies
claims1.drop(['Region','State','Area','City','Consumer_profile','Product_category','Product_type',
             'Purchased_from','Purpose'],inplace=True,axis = 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [19]:
# adding the columns to the salary data frame 
claims2 = pd.concat([claims1,dummies],axis=1)


In [20]:
claims2.head(3)

Unnamed: 0,AC_1001_Issue,AC_1002_Issue,AC_1003_Issue,TV_2001_Issue,TV_2002_Issue,TV_2003_Issue,Claim_Value,Service_Centre,Product_Age,Call_details,...,Product_category_Entertainment,Product_category_Household,Product_type_AC,Product_type_TV,Purchased_from_Dealer,Purchased_from_Internet,Purchased_from_Manufacturer,Purpose_Claim,Purpose_Complaint,Purpose_Other
0,0,0,0,1,2,0,15000.0,10,60,0.5,...,1,0,0,1,0,0,1,0,1,0
1,1,1,0,0,0,0,20000.0,12,10,1.0,...,0,1,1,0,1,0,0,0,1,0
2,0,1,2,0,0,0,18000.0,14,10,1.4,...,0,1,1,0,1,0,0,1,0,0


In [21]:
claims2.shape

(358, 82)

In [22]:
claims2['Fraud'].value_counts()

0    323
1     35
Name: Fraud, dtype: int64

In [23]:
# Separate majority and minority classes
claims2_majority = claims2[claims2.Fraud==0]
claims2_minority = claims2[claims2.Fraud==1]
 


In [24]:
	
from sklearn.utils import resample

In [25]:
# Upsample minority class
claims2_minority_upsampled = resample(claims2_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=323,    # to match majority class
                                 random_state=123) # reproducible results
 


In [26]:
# Downsample majority class
#claims2_majority_downsampled = resample(claims2_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=35,     # to match minority class
                                 random_state=123) # reproducible results

IndentationError: unexpected indent (<ipython-input-26-3f3e82acb861>, line 3)

In [27]:
# Combine majority class with upsampled minority class
claims2_upsampled = pd.concat([claims2_majority, claims2_minority_upsampled])
 


In [28]:
# Combine minority class with downsampled majority class
#claims2_downsampled = pd.concat([claims2_majority_downsampled, claims2_minority])
 


In [29]:
# Display new class counts
claims2_upsampled.Fraud.value_counts()


1    323
0    323
Name: Fraud, dtype: int64

In [30]:
# Display new class counts
#claims2_downsampled.Fraud.value_counts()

In [31]:
X = claims2_upsampled.iloc[:,[0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81]]
y = claims2_upsampled.iloc[:,10]

In [32]:
feature_cols =X

In [33]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 70% training and 30% test

In [34]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()



In [35]:
# Create Decision Tree classifer object
#clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)



In [36]:
# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)



In [37]:
#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [38]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9230769230769231


In [39]:
y_pred1 = clf.predict(X_train)

In [40]:
print("Accuracy:",metrics.accuracy_score(y_train,y_pred1))

Accuracy: 0.9786821705426356


In [41]:
print ('Recall:', metrics.recall_score(y_test, y_pred))

Recall: 1.0


In [42]:
print ('Precision:', metrics.precision_score(y_test,y_pred))
                                    


Precision: 0.8809523809523809


In [43]:
print ('\n clasification report:\n', metrics.classification_report(y_test,y_pred))



 clasification report:
               precision    recall  f1-score   support

           0       1.00      0.82      0.90        56
           1       0.88      1.00      0.94        74

    accuracy                           0.92       130
   macro avg       0.94      0.91      0.92       130
weighted avg       0.93      0.92      0.92       130



In [44]:
print ('\n confussion matrix:\n',metrics.confusion_matrix(y_test,y_pred))


 confussion matrix:
 [[46 10]
 [ 0 74]]


ValueError: Length of feature_names, 358 does not match number of features, 81