In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (by clicking run or pressing Shift+Enter) lists the files in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# **What do we plan to do here?**
* Read the dataset
* Analyze the data for missing values and outliers
* Perform uni-variate analysis
* Perform bi-variate analysis
* We will not do any feature engineering in this particular problem
* We will create a lot of visualizations to do a thorough analysis of the problem at hand
* Pick a list of algorithms which we can choose to apply in this case
* Pick the best algorithm
* Score the algorithm based on the evaluation criteria
* Fine tune algorithms to achieve the best possible value of the evaluation metric

In [None]:
### Q1) Read the dataset into the notebook
# Load the CSV from the input directory into `cfd`, then preview the first rows.
cfd = pd.read_csv(
    "../input/creditcard.csv",
)
cfd.head()

In [None]:
# Summary of the raw frame: column names, dtypes and non-null counts.
cfd.info()

In [None]:
### Q2) Print the shape of the data
# (rows, columns) of the raw frame; shown as the cell's output.
cfd.shape

In [None]:
### Q3) List out the feature variables and their data-types
# The first 30 columns are treated as the feature variables; .info() lists each one with its dtype.
cfd.iloc[:,:30].info()

### Q6) Treat the null variables. What is your strategy? Why did you use that? What other strategies could be taken? Explain
There are no null values in this dataset. If there were any, we would first count the missing values in each feature. If only a small number of values are missing, or if the feature is important, the missing values can be replaced with the feature's mean or median. If the number of missing values is very high, those values can be removed using the `dropna()` method.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
### Q7) Check for outliers in the feature variables
# Draw one box plot per feature column on a 10x3 grid (30 slots).
# The notebook treats the first 30 columns as features (see the `.iloc[:, :30]`
# slices used elsewhere); the previous `range(1, 30)` loop stopped after 29 of
# them, so the last feature column never got a plot.
fig = plt.figure(figsize=(15, 100))
cols = cfd.columns
feature_cols = cols[:30]

fig.subplots_adjust(hspace=0.5, wspace=0.4)
for i, col in enumerate(feature_cols, start=1):
    ax = fig.add_subplot(10, 3, i)
    # Pass the axes explicitly instead of relying on pyplot's "current axes".
    sns.boxplot(x=cfd[col], ax=ax)
    ax.set_title(col)

### Q8) Treat outliers. What is your strategy?

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the first 30 columns.
cfd.iloc[:,:30].describe()

In [None]:
# First and third quartile of each of the first 30 columns (one value per column).
q1, q3 = (cfd.iloc[:, :30].quantile(q) for q in (0.25, 0.75))


In [None]:
# Display the per-column first quartiles.
q1

In [None]:
# Display the per-column third quartiles.
q3

In [None]:
# Interquartile range per column: third quartile minus first quartile.
iqr = q3.sub(q1)

In [None]:
# Display the per-column interquartile ranges.
iqr

In [None]:
# Outlier fences per column: 1.5 interquartile ranges below Q1 and above Q3.
lower_bound = q1 - iqr * 1.5
upper_bound = q3 + iqr * 1.5

In [None]:
# Display the lower outlier fence of each column.
lower_bound

In [None]:
# Display the upper outlier fence of each column.
upper_bound

In [None]:
# Boolean table flagging every feature value that lies outside the 1.5*IQR fences.
# Only the first 30 (feature) columns are compared, so the frame carries the same
# labels as the bounds Series: the remaining column has no bounds, and pandas has
# deprecated automatic re-alignment in DataFrame-vs-Series comparisons.
# `lower_bound` / `upper_bound` are the fences computed in the earlier cell, reused
# here instead of being re-derived. Ending the cell with the expression (rather
# than print) gives the notebook's rich table display.
(cfd.iloc[:, :30] < lower_bound) | (cfd.iloc[:, :30] > upper_bound)

In [None]:
# Keep only the rows in which no feature value falls outside the 1.5*IQR fences.
# The comparison is restricted to the first 30 (feature) columns so the frame and
# the bounds Series carry identical labels; comparing the full frame relied on
# automatic DataFrame-vs-Series alignment, which pandas has deprecated. The
# column that was left out never had bounds, so the same rows are kept as before.
# `lower_bound` / `upper_bound` are the fences computed in the earlier cell.
outlier_mask = (cfd.iloc[:, :30] < lower_bound) | (cfd.iloc[:, :30] > upper_bound)
# NOTE(review): rows are dropped regardless of their 'Class' value; check
# cfd1['Class'].value_counts() afterwards to confirm both classes are still present.
cfd1 = cfd[~outlier_mask.any(axis=1)]
cfd1.shape

In [None]:
# Summary of the filtered frame: column names, dtypes and non-null counts.
cfd1.info()

### Q9) Pick each one of the feature variables and perform univariate analysis (be as creative as possible in your analysis)
* ### Q9.1) Visualize the shape of the distribution of data. Is every feature variable normally distributed? Why is normal distribution important for data?
* ### Q9.2) Is the data distribution skewed? If highly skewed, do you still find outliers which you did not treat?
* ### Q9.3) Draw box and whiskers plot of each of the feature variables
* ### Q9.4) How do the distributions look in terms of variation? Which features are widely spread and which are kind of concentrated towards the mean?

In [None]:
# Histogram of every column of the filtered frame on one 20x20 inch figure.
cfd1.hist(figsize=(20,20))
plt.show()

In [None]:
# Box plots of the feature columns after outlier removal, one per slot of a
# 10x3 grid (30 slots).
# The notebook treats the first 30 columns as features (see the `.iloc[:, :30]`
# slices used elsewhere); the previous `range(1, 30)` loop stopped after 29 of
# them, so the last feature column never got a plot.
fig = plt.figure(figsize=(15, 100))
cols = cfd1.columns
feature_cols = cols[:30]

fig.subplots_adjust(hspace=0.5, wspace=0.4)
for i, col in enumerate(feature_cols, start=1):
    ax = fig.add_subplot(10, 3, i)
    # Pass the axes explicitly instead of relying on pyplot's "current axes".
    sns.boxplot(x=cfd1[col], ax=ax)
    ax.set_title(col)

### Q10) Pick the feature variables and perform bi-variate analysis (be as creative as possible)
* ### Q10.1) Try creating correlation matrices. See if there are variables which are strongly or weakly related
* ### Q10.2) Try building joint distribution charts
* ### Q10.3) If there are variables showing high correlation, what corrective action is needed? Why is this a matter of concern? What if we do not treat the variables showing high degree of correlation?

In [None]:
# Pairwise Pearson correlation (the pandas default) between the columns of the
# filtered frame; `cfd2` holds the resulting square matrix.
cfd2 = cfd1.corr(method="pearson")
cfd2

In [None]:
# Heatmap of the correlation matrix on a fresh 20x20 inch figure.
corr_axes = plt.figure(figsize=(20, 20)).gca()
sns.heatmap(cfd2, ax=corr_axes)

### Q11.1) What is the type of machine learning problem at hand? (Supervised or Unsupervised?) Why?
### Q11.2) What is the category of the machine learning problem at hand? (Classification or Regression?) Why?

### Q12.1) Draw univariate plots for each of the feature variables, color each plotted point as red if the class value = 0 else green.
### Q12.2) Which feature segregates the data the cleanest way? How would you calculate the misclassification rate?
### Q12.3) Now take two features at a time, again color each plotted point as mentioned in 12.1. Calculate and comment on the misclassification rate?

### Q13.1) List down all the algorithms known to you which you think might be applicable in this case?

### Q14) Pick each of the algorithm and perform the below steps : 
### Q14.1) Split your data between test, train and validation sets. Why 3 and not just test and train?
### Q14.2) Build your model
### Q14.3) List down the evaluation metrics you would use to evaluate the performance of the model?
### Q14.4) Evaluate the model on training data
### Q14.5) Predict the response variables for the validation test data
### Q14.6) Evaluate the model on test data
### Q14.7) How are the two scores? Are they significantly different? Are they the same? Is the test score better than training score?

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix (first 30 columns) and target vector ('Class') from the filtered frame.
x, y = cfd1.iloc[:, :30], cfd1["Class"]

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): Q14.1 asks for a separate validation set, which is not created here.
# NOTE(review): the split is not stratified on `y` — confirm both classes end up in each part.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
# The fit call is the cell's last expression, so the fitted estimator is displayed.
# NOTE(review): features are used unscaled and with default solver settings —
# check the cell output for convergence warnings.
logmodel = LogisticRegression()
logmodel.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the held-out test features.
predictions = logmodel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall, F1 and support on the test split.
# print() is needed here because the report is a pre-formatted text table.
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# (empty cell — the bare `nan` placeholder is not a defined name and would raise NameError if run)