In [None]:
# Import the libraries used throughout the notebook.
# The warning filter is installed first, so every warning raised from here on is silenced.
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
# Configure the seaborn theme (style, context and colour palette).
sns.set(style='white',context='notebook',palette='muted')
import matplotlib.pyplot as plt
# Load the data: train.csv and test.csv from ./data.
train=pd.read_csv('./data/train.csv')
test=pd.read_csv('./data/test.csv')

In [None]:
# Report the (rows, columns) shape of the training set and of the prediction set.
for caption, frame in (('实验数据大小:', train), ('预测数据大小:', test)):
    print(caption, frame.shape)

In [None]:
# Stack the training rows and the prediction rows into one frame with a fresh 0..n-1 index.
# pd.concat is the public API for this; DataFrame._append is a private pandas method
# and may change or disappear without notice.
full=pd.concat([train,test],ignore_index=True)
# Column dtypes / non-null counts, then summary statistics of the numeric columns.
full.info()
full.describe()

In [None]:
# Survived aggregated per Embarked value on the training set (barplot's default estimator is the mean).
sns.barplot(x='Embarked',y='Survived',data=train)

In [None]:
# Survival rate of the passengers for each Embarked value.
# Survived holds 0/1, so its mean over the rows that have a label equals the share of
# survivors; mean() skips missing labels and, unlike value_counts(...)[1], does not
# raise KeyError when a group contains no survivor.
for port in ('S', 'C', 'Q'):
    survival_rate = full.loc[full['Embarked'] == port, 'Survived'].mean()
    print('Embarked为"%s"的乘客，其生存率为%.2f' % (port, survival_rate))

In [None]:
# Passengers who boarded in France may have survived more often because a larger share
# of them travelled first class: count passengers per Pclass, one panel per Embarked value.
sns.catplot(data=train,col='Embarked',x='Pclass',kind='count',height=3)

In [None]:
# Share of each Pclass within every Embarked category.
class_counts = train.groupby(['Embarked', 'Pclass']).size().unstack()
proportions = (
    class_counts
    .div(class_counts.sum(axis=1), axis=0)   # normalise each Embarked row so it sums to 1
    .stack()                                 # back to long form, one row per (Embarked, Pclass)
    .reset_index(name='Proportion')
)

# One bar chart per Embarked value.
g = sns.catplot(data=proportions, x='Pclass', y='Proportion', col='Embarked', kind='bar', height=3)

# Write the percentage just above the top of every bar.
for ax in g.axes.flat:
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f'{bar_height:.2%}',
            (bar_centre, bar_height),
            xytext=(0, 5),
            textcoords='offset points',
            ha='center',
            va='center',
            fontsize=10,
            color='black',
        )

plt.show()

In [None]:
# Print each intermediate object behind the Embarked x Pclass count table:
# the GroupBy object, the per-group sizes, and the unstacked table.
a=train.groupby(['Embarked','Pclass'])
print(f"row: 3 - col: 1 a -> {a}")
b=a.size()
print(f"{b}")
c=b.unstack()
print(f"{c}")
# Raw counts (rows: Embarked, columns: Pclass); the normalisation below is disabled,
# so `proportions` holds counts, not shares, after this cell.
proportions = a.size().unstack()
# proportions = proportions.div(proportions.sum(axis=1), axis=0)

# # Reshape the data to fit barplot
# proportions = proportions.stack().reset_index(name='Proportion')

In [None]:
# Survived aggregated per Parch value on the training set.
sns.barplot(x='Parch',y='Survived',data=train)

In [None]:
# Survived aggregated per SibSp value on the training set.
sns.barplot(x='SibSp',y='Survived',data=train)

In [None]:
# Survived aggregated per Pclass value on the training set.
sns.barplot(x='Pclass',y='Survived',data=train)

In [None]:
# Survived aggregated per Sex value on the training set.
sns.barplot(x='Sex',y='Survived',data=train)

In [None]:
# Create the facet grid: one density curve per Survived value, drawn on a wide axis.
ageFacet=sns.FacetGrid(train,hue='Survived',aspect=3)
# Draw a filled kernel-density estimate of Age.
# `fill=True` is the current spelling of the deprecated `shade=True` argument.
ageFacet.map(sns.kdeplot,'Age',fill=True)
# Finishing touches: clamp the x axis to [0, max Age] and add the hue legend.
ageFacet.set(xlim=(0,train['Age'].max()))
ageFacet.add_legend()

In [None]:
# Create the facet grid: one density curve per Survived value, drawn on a wide axis.
# (The variable keeps the name `ageFacet` although this cell plots Fare.)
ageFacet=sns.FacetGrid(train,hue='Survived',aspect=3)
# Filled kernel-density estimate of Fare; `fill=True` replaces the deprecated `shade=True`.
ageFacet.map(sns.kdeplot,'Fare',fill=True)
# Show only fares between 0 and 150 on the x axis, then add the hue legend.
ageFacet.set(xlim=(0,150))
ageFacet.add_legend()

In [None]:
# Inspect the distribution of Fare; the legend entry reports the column's skewness.
# sns.distplot is deprecated, so the same picture (density-scaled histogram plus a KDE
# curve) is drawn with histplot; cut=3 lets the KDE extend past the data as distplot did.
farePlot=sns.histplot(
    full['Fare'].dropna(),
    kde=True,
    stat='density',
    kde_kws=dict(cut=3),
    label='skewness:%.2f'%(full['Fare'].skew()),
)
farePlot.legend(loc='best')

In [None]:
# Log-transform Fare in place.
# NOTE: any value that fails `fare > 0` is stored as 0 - that includes zero fares and
# missing (NaN) fares, so Fare has no NaN left after this cell.
# NOTE: re-running this cell applies the logarithm a second time.
def _log_fare(fare):
    """Return the natural log of a positive fare, and 0 for anything else."""
    if fare > 0:
        return np.log(fare)
    return 0

full['Fare']=full['Fare'].map(_log_fare)

In [None]:
# Inspect the distribution of Fare; the legend entry reports the column's skewness.
# sns.distplot is deprecated, so the same picture (density-scaled histogram plus a KDE
# curve) is drawn with histplot; cut=3 lets the KDE extend past the data as distplot did.
farePlot=sns.histplot(
    full['Fare'].dropna(),
    kde=True,
    stat='density',
    kde_kws=dict(cut=3),
    label='skewness:%.2f'%(full['Fare'].skew()),
)
farePlot.legend(loc='best')

In [None]:
# Handle missing Cabin values: keep existing entries, put 'U' (Unknown) where the value is missing.
full['Cabin']=full['Cabin'].where(full['Cabin'].notna(),'U')
full['Cabin'].head()

In [None]:
# Handle missing Embarked values: first list the rows where Embarked is missing.
full.loc[full['Embarked'].isna()]

In [None]:
# Frequency of each Embarked value, most common first; missing values are not counted.
full['Embarked'].value_counts(sort=True,dropna=True)

In [None]:
# Replace missing Embarked values with 'S'; existing values are kept.
full['Embarked']=full['Embarked'].where(full['Embarked'].notna(),'S')

In [None]:
# List the rows where Embarked is still missing.
full.loc[full['Embarked'].isna()]

In [None]:
# List the rows where Fare is missing.
# NOTE(review): if the log-transform cell above has already run, it stored 0 for every
# fare that failed `x>0` (NaN included), so this selection would come back empty.
full.loc[full['Fare'].isna()]

In [None]:
# Build the new feature Title from Name.
def _extract_title(name):
    """Take the second comma-separated field of `name` and return the text before its first '.', stripped."""
    second_field = name.split(',')[1]
    before_period = second_field.split('.')[0]
    return before_period.strip()

full['Title']=full['Name'].map(_extract_title)
# Distribution of the extracted titles.
full['Title'].value_counts()

In [None]:
# Consolidate the raw titles into six groups.
# Each group lists the raw titles folded into it; TitleDict is the raw -> group lookup.
_title_groups = {
    'Mr': ('Mr',),
    'Miss': ('Mlle', 'Miss'),
    'Master': ('Master', 'Jonkheer'),
    'Mrs': ('Mme', 'Ms', 'Mrs'),
    'Royalty': ('Don', 'Sir', 'the Countess', 'Dona', 'Lady'),
    'Officer': ('Capt', 'Col', 'Major', 'Dr', 'Rev'),
}
TitleDict = {raw: group for group, raws in _title_groups.items() for raw in raws}

# Titles that are not keys of TitleDict become NaN under Series.map.
full['Title']=full['Title'].map(TitleDict)
full['Title'].value_counts()

In [None]:
# Visualise the relationship between Title and Survived.
sns.barplot(x='Title',y='Survived',data=full)

In [None]:
# familyNum = Parch + SibSp + 1.
full['familyNum']=full['Parch'].add(full['SibSp']).add(1)
# Relationship between familyNum and Survived.
sns.barplot(x='familyNum',y='Survived',data=full)

In [None]:
# Family size is bucketed by member count into three classes: small, medium, large.
def familysize(familyNum):
    """Map a family member count to a size class.

    Returns 0 when familyNum equals 1, 1 when it lies in [2, 4],
    and 2 for every other value.
    """
    if familyNum == 1:
        return 0
    if 2 <= familyNum <= 4:
        return 1
    return 2

# Apply familysize to every familyNum value, then count the rows in each class.
full['familySize']=full['familyNum'].apply(familysize)
full['familySize'].value_counts()

In [None]:
# Relationship between familySize and Survived.
sns.barplot(x='familySize',y='Survived',data=full)

In [None]:
# Deck is the first character of the Cabin field.
def _first_char(cabin):
    """Return the first character of `cabin`."""
    return cabin[0]

full['Deck']=full['Cabin'].map(_first_char)
# Survived aggregated per Deck value.
sns.barplot(x='Deck',y='Survived',data=full)

In [None]:
# Number of rows sharing each Ticket value, most frequent first.
TickCountDict=full['Ticket'].value_counts(sort=True,ascending=False)
TickCountDict.head(5)

In [None]:
# Attach to every row the number of rows that share its Ticket value.
full['TickCot']=full['Ticket'].map(TickCountDict)
full['TickCot'].head(5)

In [None]:

# Relationship between TickCot and Survived.
sns.barplot(x='TickCot',y='Survived',data=full)

In [None]:
# Print the column dtypes and non-null counts of the combined frame.
full.info()

In [None]:
# TickGroup splits passengers into three classes according to TickCot.
def TickCountGroup(num):
    """Map a shared-ticket count to a group id.

    Returns 0 when num lies in [2, 4], 1 when num equals 1 or lies in [5, 8],
    and 2 for every other value.
    """
    if 2 <= num <= 4:
        return 0
    if num == 1 or 5 <= num <= 8:
        return 1
    return 2
# Assign every passenger a TickGroup class.
full['TickGroup']=full['TickCot'].apply(TickCountGroup)
# Relationship between TickGroup and Survived.
sns.barplot(x='TickGroup',y='Survived',data=full)

In [None]:
# Inspect missing values: the first rows where Age is missing.
full.loc[full['Age'].isna()].head()

In [1]:
# Read the training CSV from disk and hand it to D-Tale.
import pandas as pd
import dtale
dtale_input = pd.read_csv("./data/train.csv")
dtale.show(dtale_input)



In [None]:
# Analyse the training CSV with Sweetviz and render the result to sweet_report.html.
import sweetviz as sv
sweetviz_input = pd.read_csv("./data/train.csv")
sweet_report = sv.analyze(sweetviz_input)
sweet_report.show_html("sweet_report.html")

In [None]:

import dabl
import pandas as pd
# Read the project's own training CSV directly.
# dabl.datasets.data_path() resolves a file name inside dabl's bundled datasets directory,
# so wrapping "./data/train.csv" in it pointed at a path that does not exist there.
titanic = pd.read_csv("./data/train.csv")
