### --- import the libraries

In [48]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn import model_selection ,impute ,preprocessing  ,svm , feature_selection ,metrics
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import LocalOutlierFactor 
from tqdm import tqdm




###  Read Data From directory 

In [49]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [50]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [51]:
train_data.shape

(891, 12)

In [52]:
# drooping un nessery columns 
train_data.drop(columns=['Name','PassengerId','Ticket' ,'Cabin'] , inplace=True)
test_data.drop(columns=['Name','PassengerId','Ticket','Cabin' ] , inplace=True)

### Function to convert columns from categrocal to numercal 
#### just give it the data 

In [62]:
def convert(  data , encoder = preprocessing.LabelEncoder()) :
    categrocal_col = data.select_dtypes('object')
    
    for i in range (len(categrocal_col.columns)):
        data[categrocal_col.columns[i]] = encoder.fit_transform(data[categrocal_col.columns[i]])
    return data

### Function to handle Missing Value using imputer Model 
#### Also just pass the data 

In [None]:
def handle_missing(data,n_neighbors =5  ):
    columns_name = data.columns
    imp = impute.KNNImputer(n_neighbors=5, weights="uniform")
    imp.fit(data)
    data = imp.transform(data)
    data = pd.DataFrame(data,columns=columns_name)
    return data


### Function to balance your data 
#### pass the feutures(x) and the targer (y) and it will return x and y with balanced classes 
#### and if the `scala_feutures` == True it also will return x_norm which is the scaled Feutures 

In [102]:
def balance_data (x,y , scala_feutures = True  , only_scaling = False):
    x, y = SMOTE().fit_resample(x,y) 
    if scala_feutures:
        x_normalizer = preprocessing.StandardScaler()
        x_norm = x_normalizer.fit_transform(x)
    if only_scaling :
        x_normalizer = preprocessing.StandardScaler()
        x_norm = x_normalizer.fit_transform(x)
        return x_norm , y 
        exit(0)
        
    
        
    return x,y ,x_norm
    

### Function to detect and handel the outlier in the data 
##### it required the fetures (x) only 
##### and if you want the `outlier` data set the parameter `return_outlier` to True  
##### it will return the fetures withot outlier 

In [56]:
def detect_outlier(fetures,contamination=0.1 , return_outlier=False) :
    clf = LocalOutlierFactor(n_neighbors=20, contamination=contamination )
    y_pred = clf.fit_predict(fetures)
    inliers = fetures[y_pred == 1]    
    outliers = fetures[y_pred == -1]
    
    fetures.loc[outliers.index] = fetures.loc[outliers.index]* np.nan
    print (f"outler values :\n {fetures.isna().sum()}")
    if return_outlier:
        return outliers
        exit(0)
    else:
        fetures = handle_missing(fetures)
        print(f"\n data after handling outlier \n {fetures.isna().sum()}")
    return fetures
    

##### How to use these Functions !!

### `data = convert(train_data)`
### `data = handle_missing(data)`


In [13]:
y = data['Survived']
x = data.drop(columns=['Survived'])
x.shape , y.shape

((891, 7), (891,))

In [14]:
x = detect_outlier(x) 

outler values :
 Pclass      89
Sex         89
Age         89
SibSp       89
Parch       89
Fare        89
Embarked    89
dtype: int64

 data after handling outlier 
 Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [15]:
y.value_counts()

0.0    549
1.0    342
Name: Survived, dtype: int64

In [16]:
x,y,x_norm = balance_data(x,y)

In [17]:
x 

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3.000000,1.000000,22.000000,1.000000,0.000000,7.250000,2.000000
1,1.000000,0.000000,38.000000,1.000000,0.000000,71.283300,0.000000
2,3.000000,0.000000,26.000000,0.000000,0.000000,7.925000,2.000000
3,1.000000,0.000000,35.000000,1.000000,0.000000,53.100000,2.000000
4,3.000000,1.000000,35.000000,0.000000,0.000000,8.050000,2.000000
...,...,...,...,...,...,...,...
1093,2.286783,0.628429,29.479618,0.557357,0.410224,32.793531,1.566085
1094,3.000000,0.000000,22.847062,0.960781,0.078437,16.123531,2.000000
1095,2.000000,0.000000,27.839380,0.000000,0.000000,12.623230,1.910767
1096,1.000000,0.000000,20.724708,2.454118,2.000000,262.658824,0.908236


In [68]:
x_norm

array([[ 0.97474829,  0.93018736, -0.5611133 ,  0.44600622, -0.64147414],
       [-1.51205995, -1.2370795 ,  0.70639529,  0.44600622,  0.79452762],
       [ 0.97474829, -1.2370795 , -0.24423615, -0.54430615, -0.62633669],
       ...,
       [-0.26865583, -1.2370795 , -0.0985218 , -0.54430615, -0.52097485],
       [-1.51205995, -1.2370795 , -0.66214102,  1.88603726,  5.08628815],
       [ 0.97474829, -1.2370795 , -0.90967816, -0.54430615, -0.62736379]])

In [69]:
y

0       0.0
1       1.0
2       1.0
3       1.0
4       0.0
       ... 
1093    1.0
1094    1.0
1095    1.0
1096    1.0
1097    1.0
Name: Survived, Length: 1098, dtype: float64

### Function to perform future selection on data 
##### it require the Futures (x) and scaled Fetures (x_norm) and Labels (y) , and the number of fetures to select 
##### its default = 10 

In [18]:
def future_selection(x,x_normlize,y,base_model =svm.SVC(kernel='linear',max_iter=-1) ,n_features_to_select=10 ) : 
    
    
    feature_selector = feature_selection.RFE(base_model,n_features_to_select=n_features_to_select)
    feature_selector.fit(x_normlize,y)
    new_x = feature_selector.transform(x_normlize)
    print(f"the shape of new data : {new_x.shape}")
    
    selected_col = feature_selector.get_support()
    return(x.loc[:,selected_col])


In [70]:
# this function nessery for the cross validition function 
def train_test_split(x, y, train_indices, test_indices):
    return x[train_indices], x[test_indices], y[train_indices], y[test_indices]

### this function will return a data frame contain the number of fetures in the data and the accuracy of base model (svm) on it 
##### it require the fetures (x) and the scaled fetures(x_norm) and the base model (svm by default) if you want to change it 
Note: there are another methode to get the best number of fetures to select like `elpow methode` 

In [20]:
def get_the_best_number_of_futures_to_select( x,x_norm, base_model = svm.SVC(kernel='linear',max_iter=-1) ):
    
    kfold = model_selection.StratifiedShuffleSplit(n_splits=10,random_state=42)
    
    
    best_k = {
    'k':[],
    'f1_score':[],
    'recall_score':[],
    'precision_score':[]
                }
    
    for k in tqdm(range(len(x.columns))):
        f1_score = []
        recall_score = []
        precision_score = []
        k +=1

    # Create the linear model
    # Define the feature selection method
        feature_selector = feature_selection.RFE(base_model,n_features_to_select=k)
    # fit the feature Selection method on the x_norm and y
        feature_selector.fit(x_norm,y)
    # get the new fetures 
        new_x = feature_selector.transform(x_norm)

    # Evaluate svm on the features for defiant k values form 1 to 20
        for train_indexes,test_indexes in kfold.split(new_x,y):

            x_train,x_test,y_train,y_test = train_test_split(new_x,y,train_indexes,test_indexes,)

            model = svm.SVC()
            model.fit(x_train,y_train)
            pred = model.predict(x_test) 


            f1_score.append(metrics.f1_score(y_test,pred))
            recall_score.append(metrics.recall_score(y_test,pred))
            precision_score.append( metrics.precision_score(y_test,pred))



        best_k['k'].append(k)
        best_k['f1_score'].append( np.mean(f1_score))
        best_k['recall_score'].append(np.mean( recall_score))
        best_k['precision_score'].append( np.mean(precision_score))
    scores_df = pd.DataFrame(best_k)
    return scores_df

    


### here you will pass the fetures that you have chooce and labels and the model and the function will return 
### a data frame contain the performance of the model on each pach of the data 
#### you can take the average of that dataframe so you will get a better lock on your model performence 

In [71]:
def cross_validition (  x_norm,y,model = svm.SVC(),task ='Classification'   ):
    
    if task == 'Classification':
        scores_df=[]

        
        kfold = model_selection.StratifiedShuffleSplit(n_splits=10)
        accuracy = []
        precision = []
        recall = []
        f1_score = []
        test_idicses = []
        
        for train_indices, test_indices in kfold.split(x_norm,y): 
            x_train, x_test, y_train, y_test = train_test_split(x_norm,y, train_indices, test_indices)
            
            # Train model
            model.fit(x_train, y_train)
    
    
            # Predict using test set
            y_pred = model.predict(x_test)
    
             # Calculate scores
            accuracy.append( metrics.accuracy_score(y_test, y_pred))
            precision.append( metrics.precision_score(y_test, y_pred))
            recall.append( metrics.recall_score(y_test, y_pred))
            f1_score.append(metrics.f1_score(y_test, y_pred))
            test_idicses.append(test_indices)
        
        scores_df = pd.DataFrame({"accuracy": accuracy, 
                          "precision": precision, 
                          "recall": recall,
                          "F1":f1_score  , 
                        
                         'test_indices' :test_idicses})
    
            
            
    return scores_df
    
    
    
    

# implement of the functions on the titanic data 

In [74]:
# here we convert and handle missing values 
data = convert(train_data)
data = handle_missing(data)

In [79]:
data.describe()
# from locking on the data we see that there is a very little outlier so we wil hundel it 

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,30.064819,0.523008,0.381594,32.204208,1.538721
std,0.486592,0.836071,0.47799,13.644439,1.102743,0.806057,49.693429,0.794231
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,21.0,0.0,0.0,7.9104,1.0
50%,0.0,3.0,1.0,29.0,0.0,0.0,14.4542,2.0
75%,1.0,3.0,1.0,38.0,1.0,0.0,31.0,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,3.0


In [86]:
# split the data 
y = data['Survived']
x = data.drop(columns=['Survived'])
x.shape , y.shape


((891, 7), (891,))

In [87]:
x = detect_outlier(x)

outler values :
 Pclass      89
Sex         89
Age         89
SibSp       89
Parch       89
Fare        89
Embarked    89
dtype: int64

 data after handling outlier 
 Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [91]:
x.describe() 
# now we could see that this little outlier gone 

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.286783,0.628429,29.479618,0.557357,0.410224,32.793531,1.566085
std,0.798904,0.458713,12.528054,1.071522,0.788995,40.375151,0.733621
min,1.0,0.0,0.42,0.0,0.0,6.4375,0.0
25%,2.0,0.0,22.0,0.0,0.0,8.05,1.566085
50%,2.286783,1.0,29.479618,0.0,0.0,21.0,2.0
75%,3.0,1.0,36.0,1.0,0.410224,32.793531,2.0
max,3.0,1.0,70.0,8.0,6.0,263.0,3.0


In [93]:
# now lets check for data balance 
y.value_counts()
# we need to balance it 

0.0    549
1.0    342
Name: Survived, dtype: int64

In [94]:
x,y,x_norm = balance_data(x,y) 

In [95]:
y.value_counts() 

0.0    549
1.0    549
Name: Survived, dtype: int64

In [96]:
# now we need to select only the corrleted futures so we will get first the number of the best fetures 
get_the_best_number_of_futures_to_select(x,x_norm)

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:05<00:00,  1.17it/s]


Unnamed: 0,k,f1_score,recall_score,precision_score
0,1,0.733425,0.669091,0.817835
1,2,0.750434,0.683636,0.835149
2,3,0.809017,0.796364,0.823913
3,4,0.830476,0.818182,0.844784
4,5,0.821634,0.8,0.846748
5,6,0.812767,0.785455,0.845111
6,7,0.820407,0.792727,0.852084


In [99]:
# i will use 7 columns 
best_fetures  = future_selection(x,x_norm,y ,n_features_to_select=7)


the shape of new data : (1098, 7)


In [100]:
best_fetures

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3.000000,1.000000,22.000000,1.000000,0.0,7.250000,2.000000
1,1.000000,0.000000,38.000000,1.000000,0.0,71.283300,0.000000
2,3.000000,0.000000,26.000000,0.000000,0.0,7.925000,2.000000
3,1.000000,0.000000,35.000000,1.000000,0.0,53.100000,2.000000
4,3.000000,1.000000,35.000000,0.000000,0.0,8.050000,2.000000
...,...,...,...,...,...,...,...
1093,1.000000,1.000000,35.000000,0.000000,0.0,26.384938,1.257613
1094,1.000000,0.675797,50.351595,0.000000,0.0,26.348735,2.000000
1095,3.000000,0.000000,17.600000,0.000000,0.0,7.847756,1.000000
1096,3.000000,0.000000,22.989500,0.000000,0.0,7.552100,1.989500


In [104]:
# now we will fit this fetures and labels in the svm model using cross validition 
# note that we need to scale the fetures before 

x_norm ,y = balance_data(best_fetures,y ,only_scaling=True)

In [105]:
x_norm

array([[ 0.95770423,  0.9251446 , -0.55735099, ..., -0.54545382,
        -0.66035031,  0.62111035],
       [-1.52929896, -1.2535902 ,  0.69932606, ..., -0.54545382,
         0.90649463, -2.16507082],
       [ 0.95770423, -1.2535902 , -0.24318173, ..., -0.54545382,
        -0.64383359,  0.62111035],
       ...,
       [ 0.95770423, -1.2535902 , -0.90293718, ..., -0.54545382,
        -0.64572367, -0.77198024],
       [ 0.95770423, -1.2535902 , -0.47963337, ..., -0.54545382,
        -0.65295816,  0.60648299],
       [ 0.26014874,  0.9251446 , -2.14526335, ...,  2.03801468,
        -0.10249597,  0.62111035]])

In [106]:
y 

0       0.0
1       1.0
2       1.0
3       1.0
4       0.0
       ... 
1093    1.0
1094    1.0
1095    1.0
1096    1.0
1097    1.0
Name: Survived, Length: 1098, dtype: float64

In [110]:
scores = cross_validition(x_norm,y)

In [111]:
scores.mean()

  scores.mean()


accuracy     0.823636
precision    0.855934
recall       0.780000
F1           0.815395
dtype: float64