# Metoda K-średnich
## Eksperyment na danych z 'Titanic Dataset'

### Uruchomienie sesji Sparka i wczytanie danych z pliku .csv

In [1]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Przewidywanie szans na przeżycie katastrofy Titanica w zależności od różnych czynników')
    .getOrCreate()
)

# Load the Titanic CSV, treating its first line as the column header.
# No schema is supplied here; numeric columns are cast explicitly in a later cell.
rawData = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('../datasets/titanic.csv')
)

### Konwersja danych do pandas DataFrame i podgląd pierwszych wierszy

In [2]:
# Preview the first five rows. limit(5) is applied on the Spark side so only
# those rows are collected to the driver, instead of converting the whole
# DataFrame to pandas and then discarding everything but the head.
rawData.limit(5).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


### Wybór interesujących nas kolumn

In [5]:
from pyspark.sql.functions import col

# Keep only the columns used in the experiment. The numeric columns are cast
# to float; Sex and Embarked stay as strings and are index-encoded later.
dataset = rawData.select(
    col('Survived').cast('float'),
    col('Pclass').cast('float'),
    col('Sex'),
    col('Age').cast('float'),
    col('Fare').cast('float'),
    col('Embarked'),
)

# Preview five rows; limit(5) avoids collecting the full DataFrame to the driver.
dataset.limit(5).toPandas()


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0.0,3.0,male,22.0,7.25,S
1,1.0,1.0,female,38.0,71.283302,C
2,1.0,3.0,female,26.0,7.925,S
3,1.0,1.0,female,35.0,53.099998,S
4,0.0,3.0,male,35.0,8.05,S


### Zamiana brakujących danych z '?' na wartość None i usunięcie wierszy z brakami danych

In [7]:
# Turn the '?' placeholder into a proper null ...
dataset = dataset.replace('?', None)
# ... then discard every row that has a null in any column.
dataset = dataset.dropna(how='any')

### Konwersja danych z kolumn Sex i Embarked z formatu string na wartość numeryczną i dodanie ich w nowych kolumnach

In [8]:
from pyspark.ml.feature import StringIndexer

# Encode each categorical string column as a numeric index in a new column:
# Sex -> Gender, Embarked -> Boarded. handleInvalid='keep' assigns invalid
# values to an extra index instead of raising an error.
for source_col, index_col in (('Sex', 'Gender'), ('Embarked', 'Boarded')):
    indexer = StringIndexer(
        inputCol=source_col,
        outputCol=index_col,
        handleInvalid='keep',
    )
    dataset = indexer.fit(dataset).transform(dataset)

# Preview five rows; limit(5) avoids collecting the full DataFrame to the driver.
dataset.limit(5).toPandas()


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Gender,Boarded
0,0.0,3.0,male,22.0,7.25,S,0.0,0.0
1,1.0,1.0,female,38.0,71.283302,C,1.0,1.0
2,1.0,3.0,female,26.0,7.925,S,1.0,0.0
3,1.0,1.0,female,35.0,53.099998,S,1.0,0.0
4,0.0,3.0,male,35.0,8.05,S,0.0,0.0


In [9]:
# The original string columns are no longer needed once their numeric
# counterparts (Gender, Boarded) exist.
dataset = dataset.drop('Sex', 'Embarked')


In [10]:
# Names of the columns that are assembled into the feature vector used for
# clustering.
# NOTE(review): 'Survived' is itself one of the clustered features, and the
# features are not scaled before k-means — confirm both are intended.
requiredFeatures = [
    'Survived', 'Pclass', 'Age',
    'Fare', 'Gender', 'Boarded',
]

### Transformacja danych do postaci w której wszystkie cechy są jednym wektorem w nowej kolumnie

In [11]:
from pyspark.ml.feature import VectorAssembler

# Combines the selected feature columns into a single vector column named
# 'features'.
assembler = VectorAssembler().setInputCols(requiredFeatures).setOutputCol('features')

In [12]:
# Append the assembled 'features' vector column to every row.
transformed_data = assembler.transform(dataset)

# Preview five rows; limit(5) avoids collecting the full DataFrame to the driver.
transformed_data.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Age,Fare,Gender,Boarded,features
0,0.0,3.0,22.0,7.25,0.0,0.0,"[0.0, 3.0, 22.0, 7.25, 0.0, 0.0]"
1,1.0,1.0,38.0,71.283302,1.0,1.0,"[1.0, 1.0, 38.0, 71.2833023071289, 1.0, 1.0]"
2,1.0,3.0,26.0,7.925,1.0,0.0,"[1.0, 3.0, 26.0, 7.925000190734863, 1.0, 0.0]"
3,1.0,1.0,35.0,53.099998,1.0,0.0,"[1.0, 1.0, 35.0, 53.099998474121094, 1.0, 0.0]"
4,0.0,3.0,35.0,8.05,0.0,0.0,"[0.0, 3.0, 35.0, 8.050000190734863, 0.0, 0.0]"


### Klasteryzacja za pomocą algorytmu k-średnich

In [15]:
from pyspark.ml.clustering import KMeans

# Model parameters: k is the number of clusters; the seed is fixed to 1.
kmeans = KMeans().setK(5).setSeed(1)
# Train the model on the assembled feature vectors.
model = kmeans.fit(transformed_data)

### Ewaluacja wyników (idealny wynik to współczynnik silhouette = 1)

In [20]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Assign every row to a cluster; the cluster index is added as the
# 'prediction' column.
clusteredData = model.transform(transformed_data)

# Score the clustering with the evaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(clusteredData)
print('silhouette with sqared euclidan distance = ', silhouette)

silhouette with sqared euclidan distance =  0.5653938592454313


In [21]:
# Print the centre of each cluster found by the model, one vector per line.
centers = model.clusterCenters()
print('cluster centers: ')
for centroid in centers:
    print(centroid)

cluster centers: 
[ 0.2832244   2.54466231 32.12527233 13.1086776   0.28104575  0.20697168]
[  0.76470588   1.          31.23529412 231.15367396   0.76470588
   0.47058824]
[ 0.67857143  1.07142857 35.57047619 95.07713104  0.55952381  0.5       ]
[ 0.5704698   2.12751678 18.35402685 34.50953624  0.46308725  0.25503356]
[1.00000000e+00 1.00000000e+00 3.53333333e+01 5.12329224e+02
 3.33333333e-01 1.00000000e+00]


In [23]:
# Preview five clustered rows. limit(5) is applied on the Spark side so only
# those rows are collected to the driver, rather than the whole DataFrame.
clusteredData.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Age,Fare,Gender,Boarded,features,prediction
0,0.0,3.0,22.0,7.25,0.0,0.0,"[0.0, 3.0, 22.0, 7.25, 0.0, 0.0]",0
1,1.0,1.0,38.0,71.283302,1.0,1.0,"[1.0, 1.0, 38.0, 71.2833023071289, 1.0, 1.0]",2
2,1.0,3.0,26.0,7.925,1.0,0.0,"[1.0, 3.0, 26.0, 7.925000190734863, 1.0, 0.0]",0
3,1.0,1.0,35.0,53.099998,1.0,0.0,"[1.0, 1.0, 35.0, 53.099998474121094, 1.0, 0.0]",3
4,0.0,3.0,35.0,8.05,0.0,0.0,"[0.0, 3.0, 35.0, 8.050000190734863, 0.0, 0.0]",0


### Eksploracja danych

In [26]:
# Import only the functions that are needed. A wildcard import of
# pyspark.sql.functions would shadow Python built-ins such as sum, min, max,
# abs and round for the rest of the notebook.
# `count` is not used in this cell; it is imported for the per-cluster
# aggregation that follows.
from pyspark.sql.functions import avg, count

# Mean of every feature over the whole data set, as a baseline for
# comparison with the per-cluster means.
dataset.select(
    avg('Survived'),
    avg('Pclass'),
    avg('Age'),
    avg('Fare'),
    avg('Gender'),
    avg('Boarded'),
).toPandas()

Unnamed: 0,avg(Survived),avg(Pclass),avg(Age),avg(Fare),avg(Gender),avg(Boarded)
0,0.404494,2.240169,29.642093,34.567251,0.363764,0.261236


In [30]:
# Per-cluster profile: the mean of every feature plus the number of rows in
# the cluster, ordered by cluster index.
featureMeans = [
    avg(name)
    for name in ('Survived', 'Pclass', 'Age', 'Fare', 'Gender', 'Boarded')
]
aggregations = featureMeans + [count('prediction')]
perCluster = clusteredData.groupBy('prediction').agg(*aggregations)
perCluster.orderBy('prediction').toPandas()

Unnamed: 0,prediction,avg(Survived),avg(Pclass),avg(Age),avg(Fare),avg(Gender),avg(Boarded),count(prediction)
0,0,0.283224,2.544662,32.125272,13.108678,0.281046,0.206972,459
1,1,0.764706,1.0,31.235294,231.153674,0.764706,0.470588,17
2,2,0.678571,1.071429,35.570476,95.077131,0.559524,0.5,84
3,3,0.57047,2.127517,18.354027,34.509536,0.463087,0.255034,149
4,4,1.0,1.0,35.333333,512.329224,0.333333,1.0,3


### Z otrzymanych danych można zauważyć, że jedną z grup stanowią osoby w średnim wieku podróżujące
### klasą 2 lub 3, które zapłaciły za bilet niewiele.
### Powtarzając eksperyment ze zmienioną liczbą klastrów, można wyciągnąć podobne wnioski dla innych grup,
### np. wyższą szansę przeżycia kobiet niż mężczyzn we wszystkich przedziałach klasowych,
### znacznie niższą szansę przeżycia osób podróżujących klasą 2 lub 3 itd.