In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse, rmse
import seaborn as sns
import scipy.stats as stats
from scipy.stats.mstats import winsorize
from datetime import datetime
import json
from wordcloud import WordCloud

%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format

import warnings
warnings.filterwarnings(action="ignore")

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
df = df.drop('Cabin', axis=1)

In [5]:
df.Age = df.Age.fillna(df.Age.mean())
df.Embarked = df.Embarked.fillna(df.Embarked.mode())

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [7]:
df1 = pd.get_dummies(df,columns=['Sex','Embarked'],drop_first=True)

In [8]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

In [9]:
df1.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [10]:
Y = df1["Survived"]
X = df1[["Pclass","Age","SibSp","Parch","Fare","Sex_male","Embarked_Q","Embarked_S",'PassengerId']]

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test =  train_test_split(X, Y, test_size=0.20, random_state=111)

In [12]:
log_reg.fit(X_train, Y_train)

LogisticRegression()

In [13]:
train_accuracy = log_reg.score(X_train, Y_train)
test_accuracy = log_reg.score(X_test, Y_test)
print('One-vs-rest', '-'*20, 
      'Model accuracy on train data : {:.2f}'.format(train_accuracy), 
      'Model accuracy on test data   : {:.2f}'.format(test_accuracy), sep='\n')

One-vs-rest
--------------------
Model accuracy on train data : 0.79
Model accuracy on test data   : 0.78


In [14]:
log_reg_mnm = LogisticRegression(multi_class='multinomial', solver='lbfgs')
log_reg_mnm.fit(X_train, Y_train)
train_accuracy = log_reg_mnm.score(X_train, Y_train)
test_accuracy = log_reg_mnm.score(X_test, Y_test)
print('Multinomial (Softmax)', '-'*20, 
      'Model accuracy on train data : {:.2f}'.format(train_accuracy), 
      'Model accuracy on test data   : {:.2f}'.format(test_accuracy), sep='\n')

Multinomial (Softmax)
--------------------
Model accuracy on train data : 0.79
Model accuracy on test data   : 0.79


In [15]:
predictions = log_reg.predict(X_test)

In [16]:
survive = {'PassengerId': X_test.PassengerId, 'Survive': predictions}

In [17]:
table = pd.DataFrame(survive)
table

Unnamed: 0,PassengerId,Survive
374,375,1
211,212,1
258,259,1
584,585,0
461,462,0
...,...,...
739,740,0
323,324,1
395,396,0
386,387,0


In [18]:
C_değerleri = [0.001,0.01,0.1,1,10,100, 1000]
dogruluk_df = pd.DataFrame(columns = ['C_Değeri','Doğruluk'])

dogruluk_değerleri = pd.DataFrame(columns=['C Değeri', 'Eğitim Doğruluğu', 'Test Doğruluğu'])

for c in C_değerleri:
    
    # Apply logistic regression model to training data
    lr = LogisticRegression(penalty = 'l2', C = c, random_state = 0)
    lr.fit(X_train,Y_train)
    dogruluk_değerleri = dogruluk_değerleri.append({'C Değeri': c,
                                                    'Eğitim Doğruluğu' : lr.score(X_train,Y_train),
                                                    'Test Doğruluğu': lr.score(X_test, Y_test)
                                                    }, ignore_index=True)
display(dogruluk_değerleri)    

Unnamed: 0,C Değeri,Eğitim Doğruluğu,Test Doğruluğu
0,0.0,0.69,0.66
1,0.01,0.73,0.73
2,0.1,0.8,0.79
3,1.0,0.79,0.78
4,10.0,0.79,0.78
5,100.0,0.79,0.77
6,1000.0,0.79,0.77


C degeri 0.10 iken modelin performansı daha iyi

## K FOLD CROSS VALIDATION

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print("Tüm veri kümesi '0' yüzdesi : %{:.0f} ".format(len(Y[Y==0])/len(Y)*100))
print("Test verisi '0' yüzdesi     : %{:.0f} ".format(len(Y_test[Y_test==0])/len(Y_test)*100))
print("Eğitim verisi '0' yüzdesi   : %{:.0f} ".format(len(Y_train[Y_train==0])/len(Y_train)*100))

Tüm veri kümesi '0' yüzdesi : %62 
Test verisi '0' yüzdesi     : %59 
Eğitim verisi '0' yüzdesi   : %62 


In [20]:
from sklearn.model_selection import train_test_split


x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y)

print("Tüm veri kümesi '0' yüzdesi : %{:.0f} ".format(len(Y[Y==0])/len(Y)*100))
print("Test verisi '0' yüzdesi     : %{:.0f} ".format(len(y_test[y_test==0])/len(y_test)*100))
print("Eğitim verisi '0' yüzdesi   : %{:.0f} ".format(len(y_train[y_train==0])/len(y_train)*100))

Tüm veri kümesi '0' yüzdesi : %62 
Test verisi '0' yüzdesi     : %61 
Eğitim verisi '0' yüzdesi   : %62 


##burada strarify kullanmama ragmen neden sonucları aynı alıyorum?

In [21]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1111)

In [22]:
parcalar = kf.split(X)

for num, (eğitim_index, test_index) in enumerate(parcalar):
    print("{}.Eğitim Kümesi Boyutu : {}".format(num+1,len(eğitim_index)))
    print("{}.Test Kümesi Boyutu   : {}".format(num+1,len(test_index)))
    print('-'*27)

1.Eğitim Kümesi Boyutu : 712
1.Test Kümesi Boyutu   : 179
---------------------------
2.Eğitim Kümesi Boyutu : 713
2.Test Kümesi Boyutu   : 178
---------------------------
3.Eğitim Kümesi Boyutu : 713
3.Test Kümesi Boyutu   : 178
---------------------------
4.Eğitim Kümesi Boyutu : 713
4.Test Kümesi Boyutu   : 178
---------------------------
5.Eğitim Kümesi Boyutu : 713
5.Test Kümesi Boyutu   : 178
---------------------------


In [23]:
from sklearn.model_selection import cross_validate, cross_val_score


In [24]:

lrm = LogisticRegression()
cv = cross_validate(estimator=lrm,
                     X=X,
                     y=Y,
                     cv=10,return_train_score = True
                    ) 
print('Test Skorları            : ', cv['test_score'], sep = '\n')
print("-"*50)
print('Eğitim Skorları          : ', cv['train_score'], sep = '\n')

Test Skorları            : 
[0.78888889 0.82022472 0.75280899 0.82022472 0.7752809  0.76404494
 0.79775281 0.75280899 0.82022472 0.7752809 ]
--------------------------------------------------
Eğitim Skorları          : 
[0.79650437 0.78802993 0.79426434 0.79177057 0.80174564 0.79177057
 0.79426434 0.78802993 0.78428928 0.78802993]


In [25]:
cv

{'fit_time': array([0.05700517, 0.03525114, 0.02627492, 0.02607799, 0.02586198,
        0.02975392, 0.02772903, 0.03105283, 0.02749586, 0.02664208]),
 'score_time': array([0.00290489, 0.0029788 , 0.00186706, 0.00199008, 0.00187421,
        0.00228214, 0.00180697, 0.00227094, 0.00236201, 0.00192618]),
 'test_score': array([0.78888889, 0.82022472, 0.75280899, 0.82022472, 0.7752809 ,
        0.76404494, 0.79775281, 0.75280899, 0.82022472, 0.7752809 ]),
 'train_score': array([0.79650437, 0.78802993, 0.79426434, 0.79177057, 0.80174564,
        0.79177057, 0.79426434, 0.78802993, 0.78428928, 0.78802993])}

## burada nede egitim datasının skorlarını alamadım?

In [26]:
print('Test Kümesi   Ortalaması : ', cv['test_score'].mean())
print('Eğitim Kümesi Ortalaması : ', cv['train_score'].mean())

Test Kümesi   Ortalaması :  0.7867540574282147
Eğitim Kümesi Ortalaması :  0.791869888325379


In [29]:
cv = cross_validate(estimator=lrm, 
                     X=X,
                     y=Y,
                     cv=10, return_train_score = True,
                     scoring = ['accuracy', 'precision', 'r2']
                    )

In [30]:
print('Test Kümesi Doğruluk Ortalaması     : {:.2f}'.format(cv['test_accuracy'].mean()))
print('Test Kümesi R-kare  Ortalaması      : {:.2f}'.format(cv['test_r2'].mean()))
print('Test Kümesi Hassasiyet Ortalaması   : {:.2f}'.format(cv['test_precision'].mean()))
print('Eğitim Kümesi Doğruluk Ortalaması   : {:.2f}'.format(cv['train_accuracy'].mean()))
print('Eğitim Kümesi R-kare  Ortalaması    : {:.2f}'.format(cv['train_r2'].mean()))
print('Eğitim Kümesi Hassasiyet Ortalaması : {:.2f}'.format(cv['train_precision'].mean()))

Test Kümesi Doğruluk Ortalaması     : 0.79
Test Kümesi R-kare  Ortalaması      : 0.10
Test Kümesi Hassasiyet Ortalaması   : 0.75
Eğitim Kümesi Doğruluk Ortalaması   : 0.79
Eğitim Kümesi R-kare  Ortalaması    : 0.12
Eğitim Kümesi Hassasiyet Ortalaması : 0.76


In [33]:
cv = cross_val_score(estimator=lrm,
                     X=X,
                     y=Y,
                     cv=10                    
                    )
print('Model Skorları            : ', cv, sep = '\n')

Model Skorları            : 
[0.78888889 0.82022472 0.75280899 0.82022472 0.7752809  0.76404494
 0.79775281 0.75280899 0.82022472 0.7752809 ]


## Hiperparemetre Ayarlaması

In [31]:
logreg = LogisticRegression()
print(logreg.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [32]:
import warnings
warnings.filterwarnings('ignore')

In [35]:
parametreler = {"C": [10 ** x for x in range (-5, 5, 1)],
                "penalty": ['l1', 'l2']
                }
from sklearn.model_selection import GridSearchCV

grid_cv = GridSearchCV(estimator=logreg,
                       param_grid = parametreler,
                       cv = 10
                      )
grid_cv.fit(X, Y)

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                               1000, 10000],
                         'penalty': ['l1', 'l2']})

In [36]:
print("En iyi parametreler : ", grid_cv.best_params_)
print("En iyi skor         : ", grid_cv.best_score_)

En iyi parametreler :  {'C': 0.1, 'penalty': 'l2'}
En iyi skor         :  0.7890886392009987


In [37]:
import pandas as pd

sonuçlar = grid_cv.cv_results_
df = pd.DataFrame(sonuçlar)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0,0.0,0.0,0.0,0.0,l1,"{'C': 1e-05, 'penalty': 'l1'}",,,,,,,,,,,,,20
1,0.02,0.01,0.0,0.0,0.0,l2,"{'C': 1e-05, 'penalty': 'l2'}",0.59,0.58,0.7,0.75,0.64,0.65,0.64,0.66,0.71,0.64,0.66,0.05,10
2,0.0,0.0,0.0,0.0,0.0,l1,"{'C': 0.0001, 'penalty': 'l1'}",,,,,,,,,,,,,16
3,0.02,0.0,0.0,0.0,0.0,l2,"{'C': 0.0001, 'penalty': 'l2'}",0.61,0.6,0.69,0.73,0.66,0.64,0.66,0.67,0.69,0.66,0.66,0.04,9
4,0.0,0.0,0.0,0.0,0.0,l1,"{'C': 0.001, 'penalty': 'l1'}",,,,,,,,,,,,,19


In [38]:
df = df[['param_penalty','param_C', 'mean_test_score']]
df = df.sort_values(by='mean_test_score', ascending = False)
df

Unnamed: 0,param_penalty,param_C,mean_test_score
9,l2,0.1,0.79
15,l2,100.0,0.79
11,l2,1.0,0.79
13,l2,10.0,0.79
19,l2,10000.0,0.78
17,l2,1000.0,0.78
7,l2,0.01,0.74
5,l2,0.0,0.68
3,l2,0.0,0.66
1,l2,0.0,0.66
