### 실습: 데싸노트의 실전에서 통하는 머신러닝

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#### 12장 K-평균 군집화

In [None]:
file_url= 'https://raw.githubusercontent.com/musthave-ML10/data_source/main/example_cluster.csv'
data=pd.read_csv(file_url)

In [None]:
data

In [None]:
sns.scatterplot(x='var_1',y='var_2', data=data)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Build a 3-cluster K-means model with a fixed seed and fit it on the example data
kmeans_model=KMeans(n_clusters=3, random_state=100).fit(data)
# Display the cluster index assigned to every row
kmeans_model.predict(data)

In [None]:
data['label']=kmeans_model.predict(data)

In [None]:
sns.scatterplot(x='var_1',y='var_2', data=data, hue='label', palette='rainbow')

In [None]:
# Notebook front-end setting (unrelated to the analysis): sets the CodeCell
# cm_config option "autoCloseBrackets" to True in the 'notebook' config section.
# NOTE(review): `notebook.services.config` looks specific to the classic Notebook
# server — confirm it is available in the installed Jupyter version.
from notebook.services.config import ConfigManager
c = ConfigManager()
c.update('notebook', {"CodeCell": {"cm_config": {"autoCloseBrackets": True}}})

In [None]:
kmeans_model.inertia_

In [None]:
# Elbow method: record the model inertia for k = 2..9.
# `data` already carries the 'label' column written by the earlier prediction cell;
# that is a cluster id, not a feature, so it is removed before fitting.
features=data.drop(columns='label', errors='ignore')
distance=[]
for k in range(2,10):
    # Fixed seed (same as the model above) so the curve is reproducible between runs
    k_model=KMeans(n_clusters=k, random_state=100)
    k_model.fit(features)
    distance.append(k_model.inertia_)

In [None]:
sns.lineplot(x=range(2,10), y=distance)

In [None]:
# 고객 데이터
file_url='https://raw.githubusercontent.com/musthave-ML10/data_source/main/customer.csv'
data=pd.read_csv(file_url)
data.head()

In [None]:
data.info()

In [None]:
data.cc_num.nunique()

In [None]:
data.category.nunique()

In [None]:
data_dummy=pd.get_dummies(data, columns=['category'])
data_dummy.head()

In [None]:
# Pick the one-hot columns generated from 'category' by their name prefix instead of
# by position, so the selection does not depend on the column order of the CSV
cat_list=[col for col in data_dummy.columns if col.startswith('category_')]
cat_list

In [None]:
# Turn every category indicator into the amount spent in that category:
# each indicator column is multiplied row-wise by the 'amt' column
data_dummy[cat_list]=data_dummy[cat_list].mul(data_dummy['amt'], axis=0)

In [None]:
data_dummy

In [None]:
data_agg=data_dummy.groupby('cc_num').sum()
data_agg

In [None]:
# Scaling: standardize every aggregated column, keeping the cc_num index and column names
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_values=scaler.fit_transform(data_agg)
scaled_df=pd.DataFrame(scaled_values, index=data_agg.index, columns=data_agg.columns)
scaled_df

In [None]:
# Run the elbow method on the scaled customer data: inertia for k = 2..9
distance=[]

for k in range(2,10):
    # Only inertia_ is needed here, so a single fit() replaces the previous
    # fit_transform() + predict() pair whose results were never used.
    # The seed keeps the curve reproducible between runs.
    k_model=KMeans(n_clusters=k, random_state=100)
    k_model.fit(scaled_df)
    distance.append(k_model.inertia_)

In [None]:
sns.lineplot(x=range(2,10), y=distance)

In [None]:
# The elbow curve decreases too smoothly to pick k, so compare silhouette scores instead
from sklearn.metrics import silhouette_score

silhouette=[]
for k in range(2,10):
    # Seeded so the score curve is the same on every run
    k_model=KMeans(n_clusters=k, random_state=100)
    # fit_predict() fits the model and returns the labels in one pass
    # (previously: fit_transform() with a discarded result, then predict())
    labels=k_model.fit_predict(scaled_df)
    silhouette.append(silhouette_score(scaled_df, labels))
sns.lineplot(x=range(2,10), y=silhouette)

In [None]:
# Final clustering model and interpretation of the result
# If this cell has been run before, scaled_df already holds a 'label' column;
# drop it so cluster ids are never used as an input feature.
features=scaled_df.drop(columns='label', errors='ignore')
# Seeded so the cluster numbering (and therefore the interpretation) is stable between runs
k_model=KMeans(n_clusters=4, random_state=100)
labels=k_model.fit_predict(features)

scaled_df['label']=labels
# Per-cluster mean of every scaled feature, and the number of customers in each cluster
scaled_df_mean=scaled_df.groupby('label').mean()
scaled_df_count=scaled_df.groupby('label').count()['category_travel']

In [None]:
scaled_df_mean

In [None]:
scaled_df_count=scaled_df_count.rename('count')

In [None]:
scaled_df_all=scaled_df_mean.join(scaled_df_count)
scaled_df_all

#### 13장 차원 축소: 주성분분석(PCA)

In [None]:
file_url= 'https://raw.githubusercontent.com/musthave-ML10/data_source/main/customer_pca.csv'
data=pd.read_csv(file_url)
data

In [None]:
# 1. 종속변수와 독립변수의 분리
y= data['label']
X=data.drop('label', axis=1)
print(y.shape, X.shape)

In [None]:
from sklearn.decomposition import PCA

pca_model=PCA(n_components=2)
data_pca=pca_model.fit_transform(X)
data_pca

In [None]:
data_pca=pd.DataFrame(data_pca, columns=['PC1','PC2'])
data_pca=data_pca.join(y)

In [None]:
data_pca.head()

In [None]:
sns.scatterplot(x='PC1',y='PC2',data=data_pca, hue='label', palette='rainbow')

In [None]:
pca_model.components_

In [None]:
df_data=pd.DataFrame(pca_model.components_, columns=X.columns)
df_data

In [None]:
sns.heatmap(df_data, cmap='coolwarm')

In [None]:
#2 차원축소로 학습시간 줄이고 성능향상시키기 : 지도학습에서 사용

In [None]:
file_url= 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/anonymous.csv'
data=pd.read_csv(file_url)

In [None]:
data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; 'class' is the target and every other column is a feature
X_train, X_test, y_train, y_test=train_test_split(
    data.drop('class',axis=1),
    data['class'],
    test_size=0.2,
    random_state=100,
)
# The scaler is fitted on the training split only and then applied to both splits
scale=StandardScaler()
scaled_X_train=scale.fit(X_train).transform(X_train)
scaled_X_test=scale.transform(X_test)


In [None]:
from sklearn.ensemble import RandomForestClassifier
model_rf=RandomForestClassifier(random_state=100)
model_rf.fit(scaled_X_train, y_train)

from sklearn.metrics import accuracy_score, roc_auc_score
pred_1=model_rf.predict(scaled_X_test)
accuracy_score(y_test,pred_1)

#### 선형회귀 -지도학습

In [None]:
# 보험료 예측
file_url= 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/insurance.csv'
data=pd.read_csv(file_url)
data.info()

In [None]:
round(data.describe(), 2)

In [None]:
X=data.drop('charges', axis=1)
y=data['charges']

X_train,X_test, y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=100)

print(X_train.shape,X_test.shape,y_train.shape)

In [None]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,y_train)
pred=model.predict(X_test)

In [None]:
# 평가하기 1
comp=pd.DataFrame({'actual' : y_test, 'pred': pred})
comp

In [None]:
#방법2 그래프
sns.scatterplot(x='actual', y='pred', data=comp)

In [None]:
# 3. RMSE (root mean squared error)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# The `squared=False` argument is no longer accepted by recent scikit-learn releases;
# taking the square root explicitly works on every version and matches the
# `** 0.5` form used in the random-forest chapter below.
mean_squared_error(y_test, pred) ** 0.5

In [None]:
r2_score(y_test, pred)

In [None]:
model.coef_

In [None]:
model.intercept_

In [None]:
pd.Series(model.coef_, index=X.columns)

In [None]:
#### 로지스틱 회귀-지도

In [None]:
file_url= 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/titanic.csv'
data=pd.read_csv(file_url)
data.head()

In [None]:
data.info()

In [None]:
data.corr(numeric_only=True)

In [None]:
sns.heatmap(data.corr(numeric_only=True), cmap='coolwarm', vmin=-1, vmax=1, annot=True)


In [None]:
data=data.drop(['Name','Ticket'],axis=1)
data

In [None]:
data=pd.get_dummies(data, columns=['Sex','Embarked'], drop_first=True)

In [None]:
X=data.drop('Survived',axis=1)
y=data['Survived']

X_train, X_test, y_train,y_test=train_test_split(X,y, test_size=0.2, random_state=100)
print(X_train.shape, y_train.shape,y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(X_train,y_train)
pred=model.predict(X_test)
accuracy_score(y_test,pred)

In [None]:
model.coef_

In [None]:
pd.Series(model.coef_[0], index=X.columns)

In [None]:
# 다중공선성 줄이기: 상관성 많은 변수합치기

data['family']=data['SibSp'] +  data['Parch']
data.drop(['SibSp', 'Parch'], axis=1, inplace=True)
data.head()

In [None]:
X=data.drop('Survived', axis=1)
y=data['Survived']

In [None]:
X_train,X_test, y_train, y_test=train_test_split(X,y, test_size=0.2, random_state=100)
model=LogisticRegression()
model.fit(X_train,y_train)
pred=model.predict(X_test)
accuracy_score(y_test, pred)

#### K-최근접이웃(KNN)

In [None]:
file_url= 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/wine.csv'
data=pd.read_csv(file_url)
data.head()

In [None]:
data.info()

In [None]:
data['class'].value_counts()

In [None]:
sns.barplot(x=data['class'].value_counts().index, y=data['class'].value_counts())

In [None]:
data.isna().sum()

In [None]:
data.fillna(data.median(), inplace=True)
data.isna().sum()

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

st_scaler=StandardScaler()
st_scaler.fit(data)
st_scaled=st_scaler.transform(data)
st_scaled

In [None]:
st_scaled=pd.DataFrame(st_scaled, columns=data.columns)

In [None]:
round(st_scaled.describe(),2)

In [None]:
rb_scaler=RobustScaler()
rb_scaled=rb_scaler.fit_transform(data)
rb_scaled=pd.DataFrame(rb_scaled, columns=data.columns)

In [None]:
rb_scaled.head()

In [None]:
# Min-max scaling of the same data.
# Uses its own names so the RobustScaler objects from the previous cell
# (rb_scaler / rb_scaled) are not overwritten by a different kind of scaler.
mm_scaler=MinMaxScaler()
mm_scaled=mm_scaler.fit_transform(data)
mm_scaled=pd.DataFrame(mm_scaled, columns=data.columns)
mm_scaled.head()

In [None]:
X_train,X_test, y_train, y_test=train_test_split(data.drop(['class'], axis=1),data['class'], test_size=0.2, random_state=100)
mm_scaler=MinMaxScaler()
mm_scaler.fit(X_train)
X_train_scaled=mm_scaler.transform(X_train)
X_test_scaled=mm_scaler.transform(X_test)

from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
pred=knn.predict(X_test_scaled)
accuracy_score(y_test, pred)


In [None]:
# Test-set accuracy for every neighbourhood size k = 1..20
scores=[]
for k in range(1,21):
    knn=KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    pred=knn.predict(X_test_scaled)
    acc=accuracy_score(y_test, pred)
    scores.append(acc)

In [None]:
sns.lineplot(x=range(1,21), y=scores)

In [None]:
knn=KNeighborsClassifier(n_neighbors=13)
knn.fit(X_train_scaled, y_train)
pred=knn.predict(X_test_scaled)
accuracy_score(y_test, pred)

#### 8장 결정트리

In [None]:
file_url= 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/salary.csv'
data=pd.read_csv(file_url, skipinitialspace=True)

In [None]:
data['class'].unique()

In [None]:
data.info()

In [None]:
data.describe(include='all')

In [None]:
#종속변수 정리
data['class']=data['class'].map({'<=50K': 0, '>50k': 1})

In [None]:
data['class']=data['class'].apply(lambda x:0 if x == '<=50K' else 1)

In [None]:
from sklearn.preprocessing import LabelEncoder
lbe=LabelEncoder()
data['class']=lbe.fit_transform(data['class'])

In [None]:
# Names of all columns stored with the 'object' dtype
obj_list=[col for col in data.columns if data[col].dtype == 'object']

In [None]:
obj_list

In [None]:
for i in obj_list:
    print(i,data[i].nunique())

In [None]:
for i in obj_list:
    if data[i].nunique() >= 10:
        print(i,data[i].nunique())

In [None]:
data['education'].value_counts()

In [None]:
data.drop('education',axis=1, inplace=True)

In [None]:
data.info()

In [None]:
data['native-country'].value_counts()

In [None]:
contry_group=data.groupby('native-country')['class'].mean()

In [None]:
contry_group.reset_index()

In [None]:
# Attach the per-country mean of 'class' to every row.
# contry_group is a Series that is itself named 'class'; merging it unchanged makes
# pandas rename the target to 'class_x' and the mean to 'class_y'. Giving the mean
# its own name keeps the 'class' column intact.
data=data.merge(contry_group.rename('native-country_class_mean'),on='native-country',how='left')

#### 9장 랜덤포레스트

In [None]:
file_url= 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/car.csv'
data=pd.read_csv(file_url, skipinitialspace=True)

In [None]:
data.head()

In [None]:
data.info()

In [None]:
print(round(data.describe(),2))

In [None]:
#전처리: 단위 일치, 숫자형으로 변환
data[['engine','engine_unit']]=data['engine'].str.split(expand=True)

In [None]:
data.drop('engine_unit', axis=1, inplace=True)

In [None]:
data['engine']=data['engine'].astype('float32')

In [None]:
data['engine'].head()

In [None]:
data[['max_power','max_power_unit']]=data['max_power'].str.split(expand=True)

In [None]:
data.head()

In [None]:
# Convert max_power to numbers; entries that cannot be parsed become NaN.
# pd.to_numeric is used instead of the isFloat helper because that helper is only
# defined in a later cell, so this cell failed with NameError when run in file order.
data['max_power']=pd.to_numeric(data['max_power'], errors='coerce')

In [None]:
data['max_power_unit']

In [None]:
def isFloat(value):
    """Return *value* converted to float, or NaN when it cannot be converted.

    Args:
        value: Anything accepted by ``float()`` (number or numeric string).

    Returns:
        The float value, or ``np.nan`` if ``float(value)`` raises ``ValueError``
        (e.g. a non-numeric string) or ``TypeError`` (e.g. ``None``).
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan

In [None]:
data.drop('max_power_unit', axis=1, inplace=True)

In [None]:
data[['mileage','mileage_unit']]=data['mileage'].str.split(expand=True)

In [None]:
data['mileage']=data['mileage'].astype('float32')

In [None]:
data['mileage'].unique()

In [None]:
data['fuel'].unique()

In [None]:
def mile(x):
    """Divide a row's mileage by a constant chosen from its fuel type.

    Args:
        x: Row-like mapping with the keys ``'fuel'`` and ``'mileage'``.

    Returns:
        ``x['mileage']`` divided by 80.43 (Petrol), 73.56 (Diesel), 40.85 (LPG)
        or 73.23 (any other fuel).
    """
    # NOTE(review): the divisors look like per-fuel unit prices — confirm the source.
    divisor_by_fuel = {'Petrol': 80.43, 'Diesel': 73.56, 'LPG': 40.85}
    return x['mileage'] / divisor_by_fuel.get(x['fuel'], 73.23)

In [None]:
data['mileage']=data.apply(mile, axis=1)

In [None]:
data.head()

In [None]:
data.drop('mileage_unit',axis=1, inplace=True)

In [None]:
data['torque']=data['torque'].str.upper()

In [None]:
def torque_unit(x):
    """Detect the torque unit mentioned in *x*.

    Args:
        x: Torque description; it is converted with ``str()`` before searching.

    Returns:
        ``'Nm'`` if the text contains ``'NM'``, otherwise ``'kgm'`` if it contains
        ``'KGM'``, otherwise ``None``.
    """
    text = str(x)
    # 'NM' is checked first, so it wins when both markers are present
    for marker, unit in (('NM', 'Nm'), ('KGM', 'kgm')):
        if marker in text:
            return unit
    return None

In [None]:
data['torque_unit']=data['torque'].apply(torque_unit)

In [None]:
data['torque_unit'].unique()

In [None]:
data[data['torque_unit'].isna()]['torque'].unique()

In [None]:
# Rows without a recognised unit are treated as Nm.
# Assign the result back: an in-place fillna on a column selection acts on a
# temporary object and leaves `data` unchanged under pandas Copy-on-Write.
data['torque_unit']=data['torque_unit'].fillna('Nm')

In [None]:
def split_num(x):
    """Return the leading numeric part of *x*.

    Args:
        x: Any value; it is converted with ``str()`` first.

    Returns:
        The prefix of ``str(x)`` up to (not including) the first character that is
        not a digit or a dot. The whole string is returned when every character is
        numeric, and ``''`` when the string is empty or starts with a non-numeric
        character (for example ``'nan'``).
    """
    x=str(x)
    for i,j in enumerate(x):
        if j not in '0123456789.':
            return x[:i]
    # No non-numeric character found (previously this path raised UnboundLocalError)
    return x

In [None]:
data['torque']=data['torque'].apply(split_num)

In [None]:
data['torque']

In [None]:
# split_num returns an empty string (never a space) when a value has no leading
# number, so '' must be turned into NaN before the float conversion in the next cell;
# ' ' is still replaced as before. np.nan is used because np.NaN was removed in NumPy 2.0.
data['torque']=data['torque'].replace(['', ' '],np.nan)

In [None]:
data['torque']=data['torque'].astype('float32')

In [None]:
data['torque']

In [None]:
def torque_trans(x):
    """Return the row's torque, multiplied by 9.8066 when its unit is 'kgm'.

    Args:
        x: Row-like mapping with the keys ``'torque'`` and ``'torque_unit'``.

    Returns:
        ``x['torque'] * 9.8066`` for rows whose ``'torque_unit'`` equals ``'kgm'``;
        the unchanged ``x['torque']`` for every other unit.
    """
    value = x['torque']
    return value * 9.8066 if x['torque_unit'] == 'kgm' else value

In [None]:
data['torque']=data.apply(torque_trans, axis=1)

In [None]:
data.drop('torque_unit', axis=1, inplace=True)

In [None]:
data['torque'].astype('float32')

In [None]:
data.info()

In [None]:
data['name']=data['name'].str.split(expand=True)[0]

In [None]:
data['name'].unique()

In [None]:
data['name']=data['name'].replace('Land','Land Rover')

In [None]:
data.isna().mean()

In [None]:
data.dropna(inplace=True)

In [None]:
data.info()

In [None]:
data=pd.get_dummies(data, columns=['name','fuel','seller_type','transmission','owner'])

In [None]:
data.head()

In [None]:
from sklearn.model_selection import train_test_split, KFold

X_train, X_test, y_train, y_test = train_test_split(data.drop('selling_price', axis=1), data['selling_price'], test_size = 0.2, random_state=0)

In [None]:
print(X_train.shape, y_train.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

# Baseline: random forest regressor with default hyper-parameters and a fixed seed
model=RandomForestRegressor(random_state=100).fit(X_train, y_train)
train_pred=model.predict(X_train)
test_pred=model.predict(X_test)

# RMSE on the training and the held-out split (square root of the MSE)
train_rmse=mean_squared_error(y_train, train_pred) ** 0.5
test_rmse=mean_squared_error(y_test, test_pred) ** 0.5
print('train_rmse: ', train_rmse, 'test_rmse: ', test_rmse)

In [None]:
data

In [None]:
data.reset_index(drop=True, inplace=True)

In [None]:
# 5-fold cross-validation of a tuned random forest; RMSE is collected for every fold
kf=KFold(n_splits=5)
X=data.drop('selling_price', axis=1)
y=data['selling_price']

train_rmse_total=[]
test_rmse_total=[]


for train_index, test_index in kf.split(X):
    # kf.split() yields row positions, so select with .iloc; label-based .loc / []
    # only gives the same rows when the index happens to be 0..n-1.
    X_train, X_test =X.iloc[train_index], X.iloc[test_index]
    y_train, y_test =y.iloc[train_index], y.iloc[test_index]

    model=RandomForestRegressor(n_estimators=300, max_depth=50, min_samples_leaf=1, min_samples_split=15, n_jobs=-1,random_state=100)
    model.fit(X_train, y_train)
    train_pred=model.predict(X_train)
    test_pred=model.predict(X_test)
    train_rmse=mean_squared_error(y_train,train_pred) ** 0.5
    test_rmse=mean_squared_error(y_test, test_pred) ** 0.5
    train_rmse_total.append(train_rmse)
    test_rmse_total.append(test_rmse)


In [None]:
# Mean test RMSE over the folds; dividing by the number of collected scores keeps
# this correct if n_splits is changed
sum(test_rmse_total)/len(test_rmse_total)

#### 10장 XGBoost

In [94]:
file_url= 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/dating.csv'
data=pd.read_csv(file_url, skipinitialspace=True)

In [95]:
pd.options.display.max_columns=40

In [96]:
data.head()

Unnamed: 0,has_null,gender,age,age_o,race,race_o,importance_same_race,importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,pref_o_ambitious,pref_o_shared_interests,attractive_o,sincere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o,attractive_important,sincere_important,intellicence_important,funny_important,ambtition_important,shared_interests_important,attractive_partner,sincere_partner,intelligence_partner,funny_partner,ambition_partner,shared_interests_partner,interests_correlate,expected_happy_with_sd_people,expected_num_interested_in_me,like,guess_prob_liked,met,match
0,0,female,21.0,27.0,Asian/PacificIslander/Asian-American,European/Caucasian-American,2.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,6.0,8.0,8.0,8.0,8.0,6.0,15.0,20.0,20.0,15.0,15.0,15.0,6.0,9.0,7.0,7.0,6.0,5.0,0.14,3.0,2.0,7.0,6.0,0.0,0
1,0,female,21.0,22.0,Asian/PacificIslander/Asian-American,European/Caucasian-American,2.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,7.0,8.0,10.0,7.0,7.0,5.0,15.0,20.0,20.0,15.0,15.0,15.0,7.0,8.0,7.0,8.0,5.0,6.0,0.54,3.0,2.0,7.0,5.0,1.0,0
2,1,female,21.0,22.0,Asian/PacificIslander/Asian-American,Asian/PacificIslander/Asian-American,2.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,10.0,10.0,10.0,10.0,10.0,10.0,15.0,20.0,20.0,15.0,15.0,15.0,5.0,8.0,9.0,8.0,5.0,7.0,0.16,3.0,2.0,7.0,,1.0,1
3,0,female,21.0,23.0,Asian/PacificIslander/Asian-American,European/Caucasian-American,2.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,7.0,8.0,9.0,8.0,9.0,8.0,15.0,20.0,20.0,15.0,15.0,15.0,7.0,6.0,8.0,7.0,6.0,8.0,0.61,3.0,2.0,7.0,6.0,0.0,1
4,0,female,21.0,24.0,Asian/PacificIslander/Asian-American,Latino/HispanicAmerican,2.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,8.0,7.0,9.0,6.0,9.0,7.0,15.0,20.0,20.0,15.0,15.0,15.0,5.0,6.0,7.0,7.0,6.0,6.0,0.21,3.0,2.0,6.0,6.0,0.0,1


In [97]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 39 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   has_null                       8378 non-null   int64  
 1   gender                         8378 non-null   object 
 2   age                            8283 non-null   float64
 3   age_o                          8274 non-null   float64
 4   race                           8315 non-null   object 
 5   race_o                         8305 non-null   object 
 6   importance_same_race           8299 non-null   float64
 7   importance_same_religion       8299 non-null   float64
 8   pref_o_attractive              8289 non-null   float64
 9   pref_o_sincere                 8289 non-null   float64
 10  pref_o_intelligence            8289 non-null   float64
 11  pref_o_funny                   8280 non-null   float64
 12  pref_o_ambitious               8271 non-null   f

In [98]:
data.describe()

Unnamed: 0,has_null,age,age_o,importance_same_race,importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,pref_o_ambitious,pref_o_shared_interests,attractive_o,sincere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o,attractive_important,sincere_important,intellicence_important,funny_important,ambtition_important,shared_interests_important,attractive_partner,sincere_partner,intelligence_partner,funny_partner,ambition_partner,shared_interests_partner,interests_correlate,expected_happy_with_sd_people,expected_num_interested_in_me,like,guess_prob_liked,met,match
count,8378.0,8283.0,8274.0,8299.0,8299.0,8289.0,8289.0,8289.0,8280.0,8271.0,8249.0,8166.0,8091.0,8072.0,8018.0,7656.0,7302.0,8299.0,8299.0,8299.0,8289.0,8279.0,8257.0,8176.0,8101.0,8082.0,8028.0,7666.0,7311.0,8220.0,8277.0,1800.0,8138.0,8069.0,8003.0,8378.0
mean,0.87491,26.358928,26.364999,3.784793,3.651645,22.495347,17.396867,20.270759,17.459714,10.685375,11.84593,6.190411,7.175256,7.369301,6.400599,6.778409,5.47487,22.514632,17.396389,20.265613,17.457043,10.682539,11.845111,6.189995,7.175164,7.368597,6.400598,6.777524,5.474559,0.19601,5.534131,5.570556,6.134087,5.207523,0.049856,0.164717
std,0.33084,3.566763,3.563648,2.845708,2.805237,12.569802,7.044003,6.782895,6.085526,6.126544,6.362746,1.950305,1.740575,1.550501,1.954078,1.79408,2.156163,12.587674,7.0467,6.783003,6.085239,6.124888,6.362154,1.950169,1.740315,1.550453,1.953702,1.794055,2.156363,0.303539,1.734059,4.762569,1.841285,2.129565,0.282168,0.370947
min,0.0,18.0,18.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.83,1.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,24.0,24.0,1.0,1.0,15.0,15.0,17.39,15.0,5.0,9.52,5.0,6.0,6.0,5.0,6.0,4.0,15.0,15.0,17.39,15.0,5.0,9.52,5.0,6.0,6.0,5.0,6.0,4.0,-0.02,5.0,2.0,5.0,4.0,0.0,0.0
50%,1.0,26.0,26.0,3.0,3.0,20.0,18.37,20.0,18.0,10.0,10.64,6.0,7.0,7.0,7.0,7.0,6.0,20.0,18.18,20.0,18.0,10.0,10.64,6.0,7.0,7.0,7.0,7.0,6.0,0.21,6.0,4.0,6.0,5.0,0.0,0.0
75%,1.0,28.0,28.0,6.0,6.0,25.0,20.0,23.81,20.0,15.0,16.0,8.0,8.0,8.0,8.0,8.0,7.0,25.0,20.0,23.81,20.0,15.0,16.0,8.0,8.0,8.0,8.0,8.0,7.0,0.43,7.0,8.0,7.0,7.0,0.0,0.0
max,1.0,55.0,55.0,10.0,10.0,100.0,60.0,50.0,50.0,53.0,30.0,10.5,10.0,10.0,11.0,10.0,10.0,100.0,60.0,50.0,50.0,53.0,30.0,10.0,10.0,10.0,10.0,10.0,10.0,0.91,10.0,20.0,10.0,10.0,8.0,1.0


In [100]:
data.isna().sum()

has_null                            0
gender                              0
age                                95
age_o                             104
race                               63
race_o                             73
importance_same_race               79
importance_same_religion           79
pref_o_attractive                  89
pref_o_sincere                     89
pref_o_intelligence                89
pref_o_funny                       98
pref_o_ambitious                  107
pref_o_shared_interests           129
attractive_o                      212
sincere_o                         287
intelligence_o                    306
funny_o                           360
ambitous_o                        722
shared_interests_o               1076
attractive_important               79
sincere_important                  79
intellicence_important             79
funny_important                    89
ambtition_important                99
shared_interests_important        121
attractive_p

In [101]:
# Drop rows that are missing any importance weight (the partner's pref_o_* columns
# and my own *_important columns).
# 'sincere_important' replaces 'sincere_partner' in this list: sincere_partner is a
# rating column, and missing ratings are kept and filled with -99 in the next cell,
# like every other *_partner column.
data=data.dropna(subset=['pref_o_attractive', 'pref_o_sincere' ,
                         'pref_o_intelligence','pref_o_funny','pref_o_ambitious','pref_o_shared_interests',
                         'attractive_important' ,'sincere_important','intellicence_important','funny_important',
                         'ambtition_important','shared_interests_important'])

In [102]:
data=data.fillna(-99)

In [104]:
data.isna().sum()

has_null                         0
gender                           0
age                              0
age_o                            0
race                             0
race_o                           0
importance_same_race             0
importance_same_religion         0
pref_o_attractive                0
pref_o_sincere                   0
pref_o_intelligence              0
pref_o_funny                     0
pref_o_ambitious                 0
pref_o_shared_interests          0
attractive_o                     0
sincere_o                        0
intelligence_o                   0
funny_o                          0
ambitous_o                       0
shared_interests_o               0
attractive_important             0
sincere_important                0
intellicence_important           0
funny_important                  0
ambtition_important              0
shared_interests_important       0
attractive_partner               0
sincere_partner                  0
intelligence_partner

In [109]:
def age_gap(x):
    """Signed age difference for one row, or -99 when either age is missing.

    Args:
        x: Row-like mapping with the keys ``'age'``, ``'age_o'`` and ``'gender'``.

    Returns:
        -99 if ``'age'`` or ``'age_o'`` equals -99. Otherwise ``age_o - age`` when
        ``'gender'`` is ``'female'`` and ``age - age_o`` for any other gender.
    """
    age, partner_age = x['age'], x['age_o']
    if age == -99 or partner_age == -99:
        return -99
    if x['gender'] == 'female':
        return partner_age - age
    return age - partner_age

In [110]:
data['age_gap']=data.apply(age_gap, axis=1)

In [111]:
# Absolute age difference. Rows where age_gap is the -99 "missing" marker keep -99
# instead of being turned into +99 by abs().
data['age_gap_abs']=data['age_gap'].abs().where(data['age_gap'] != -99, -99)

In [112]:
def sam_race(x):
    """Flag whether both people in a row have the same race.

    Args:
        x: Row-like mapping with the keys ``'race'`` and ``'race_o'``.

    Returns:
        -99 if either value equals -99, 1 if the two values are equal,
        -1 otherwise.
    """
    race, partner_race = x['race'], x['race_o']
    if race == -99 or partner_race == -99:
        return -99
    return 1 if race == partner_race else -1

In [113]:
data['same_race']=data.apply(sam_race, axis=1)

In [114]:
def same_race_point(x):
    """Sign the importance of sharing a race by whether the pair actually shares it.

    Args:
        x: Row-like mapping with the keys ``'same_race'`` (1, -1 or -99) and
            ``'importance_same_race'``.

    Returns:
        -99 if either input equals -99; otherwise
        ``same_race * importance_same_race``.
    """
    # Check both inputs for the missing marker, as rating() does: multiplying a
    # -99 importance by +/-1 would otherwise yield -99 or +99.
    if x['same_race'] == -99 or x['importance_same_race'] == -99:
        return -99
    return x['same_race'] * x['importance_same_race']

In [115]:
# Overwrites importance_same_race in place with the signed score that
# same_race_point computes row by row.
# NOTE(review): because the input column is replaced, running this cell a second
# time applies the same_race sign again — consider writing to a separate column.
data['importance_same_race']=data.apply(same_race_point, axis=1)

In [116]:
def rating(data, importance, score):
    """Multiply an importance weight by a score for one row.

    Args:
        data: Row-like mapping.
        importance: Key of the importance value in *data*.
        score: Key of the score value in *data*.

    Returns:
        -99 if either value equals -99, otherwise their product.
    """
    weight, given = data[importance], data[score]
    if weight == -99 or given == -99:
        return -99
    return weight * given

In [117]:
partner_imp=data.columns[8:14]
partner_rate_me=data.columns[14:20]
my_imp=data.columns[20:26]
my_rate_partner=data.columns[26:32]

In [119]:
new_label_partner=['attractive_p' ,'sincere_partner_p',
                   'intelligence_p','funny_p,ambitous_p','ambitous','shared_interests_p']
new_label_me=['attractive_m' ,'sincere_partner_m',
                   'intelligence_m','funny_p,ambitous_m','ambitous','shared_interests_m']

In [122]:
for i,j,k in zip(new_label_partner, partner_imp, partner_rate_me):
    data[i]= data.apply(lambda x: rating(x,j,k), axis=1)
    

In [125]:
data=pd.get_dummies(data, columns=['gender','race','race_o'], drop_first=True)

In [126]:
import xgboost as xgb

X_train, X_test, y_train, y_test = train_test_split(data.drop('match',axis=1), data['match'], test_size = 0.2, random_state=0)
model=xgb.XGBClassifier(