Import the required libraries

In [22]:
import os
from getpass import getpass

import pandas as pd
from pyspark.sql import Row

Import `SparkSession`, the entry point used to create the Spark session

In [23]:
from pyspark.sql import SparkSession

Initialize the Spark session and load the MySQL JDBC connector package

In [25]:

# Build (or reuse) the Spark session. The MySQL Connector/J package is requested
# through spark.jars.packages so the JDBC driver is available for the database
# write at the end of the notebook.
# NOTE(review): spark.jars.packages is only honoured when the JVM is first
# launched; restart the kernel if a session was already running without it.
spark = (
    SparkSession.builder
    .appName("erick")
    .config('spark.jars.packages', 'mysql:mysql-connector-java:8.0.32')
    .getOrCreate()
)
# SparkSession's constructor expects a SparkContext, not another SparkSession,
# so pass the context of the session built above. The name is kept so that any
# cell referring to sqlContext keeps working.
sqlContext = SparkSession(spark.sparkContext)
# Only log errors so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")


In [26]:
# Bare last expression so the notebook displays the SparkSession object.
spark

Load my dataset

In [5]:
# First pass: read the CSV with Spark's default reader options
# (the header row is not treated as column names, no schema inference).
input_df = spark.read.format('csv').load('BreastCancer.csv')

                                                                                

The dataset has many columns, so I am using pandas to view them

In [27]:
# Pull ten rows to the driver and wrap them in a pandas DataFrame so the many
# columns can be viewed as a table; head() then displays the first five.
preview_rows = input_df.take(10)
pd.DataFrame(preview_rows, columns=input_df.columns).head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Setting `header` to true and `inferSchema` to true so that the column names and data types are read correctly

In [28]:
# Reload the same file, this time treating the first line as column names
# and letting Spark infer each column's type from the data.
input_df = spark.read.csv('BreastCancer.csv', header=True, inferSchema=True)

Viewing the data types of the columns

In [29]:
# Print the schema as a tree: column name, inferred type and nullability.
input_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- radius_mean: double (nullable = true)
 |-- texture_mean: double (nullable = true)
 |-- perimeter_mean: double (nullable = true)
 |-- area_mean: double (nullable = true)
 |-- smoothness_mean: double (nullable = true)
 |-- compactness_mean: double (nullable = true)
 |-- concavity_mean: double (nullable = true)
 |-- concave points_mean: double (nullable = true)
 |-- symmetry_mean: double (nullable = true)
 |-- fractal_dimension_mean: double (nullable = true)
 |-- radius_se: double (nullable = true)
 |-- texture_se: double (nullable = true)
 |-- perimeter_se: double (nullable = true)
 |-- area_se: double (nullable = true)
 |-- smoothness_se: double (nullable = true)
 |-- compactness_se: double (nullable = true)
 |-- concavity_se: double (nullable = true)
 |-- concave points_se: double (nullable = true)
 |-- symmetry_se: double (nullable = true)
 |-- fractal_dimension_se: double (nullable = true)
 |-- radi

In [9]:
# head(1) returns a list holding the first Row, showing every field with its parsed value.
input_df.head(1)

                                                                                

[Row(id=842302, diagnosis='M', radius_mean=17.99, texture_mean=10.38, perimeter_mean=122.8, area_mean=1001.0, smoothness_mean=0.1184, compactness_mean=0.2776, concavity_mean=0.3001, concave points_mean=0.1471, symmetry_mean=0.2419, fractal_dimension_mean=0.07871, radius_se=1.095, texture_se=0.9053, perimeter_se=8.589, area_se=153.4, smoothness_se=0.006399, compactness_se=0.04904, concavity_se=0.05373, concave points_se=0.01587, symmetry_se=0.03003, fractal_dimension_se=0.006193, radius_worst=25.38, texture_worst=17.33, perimeter_worst=184.6, area_worst=2019.0, smoothness_worst=0.1622, compactness_worst=0.6656, concavity_worst=0.7119, concave points_worst=0.2654, symmetry_worst=0.4601, fractal_dimension_worst=0.1189)]

Checking the number of rows

In [30]:
# Returns the total number of rows in the DataFrame.
input_df.count()

569

Selecting the columns I need for the analysis

In [33]:
# Columns kept for the analysis: the diagnosis label plus the mean/worst
# pair for each of four measurements.
analysis_columns = [
    'diagnosis',
    'radius_mean', 'radius_worst',
    'texture_mean', 'texture_worst',
    'compactness_mean', 'compactness_worst',
    'concavity_mean', 'concavity_worst',
]
selected_columns = input_df.select(*analysis_columns)


In [34]:
# Print the leading rows as a text table (show() defaults to 20 rows).
selected_columns.show()

+---------+-----------+------------+------------+-------------+----------------+-----------------+--------------+---------------+
|diagnosis|radius_mean|radius_worst|texture_mean|texture_worst|compactness_mean|compactness_worst|concavity_mean|concavity_worst|
+---------+-----------+------------+------------+-------------+----------------+-----------------+--------------+---------------+
|        M|      17.99|       25.38|       10.38|        17.33|          0.2776|           0.6656|        0.3001|         0.7119|
|        M|      20.57|       24.99|       17.77|        23.41|         0.07864|           0.1866|        0.0869|         0.2416|
|        M|      19.69|       23.57|       21.25|        25.53|          0.1599|           0.4245|        0.1974|         0.4504|
|        M|      11.42|       14.91|       20.38|         26.5|          0.2839|           0.8663|        0.2414|         0.6869|
|        M|      20.29|       22.54|       14.34|        16.67|          0.1328|          

Filtering the selected columns to keep only the rows I need (concavity_mean below 0.1 and texture_mean above 20)

In [35]:
# Keep only rows whose mean concavity is below 0.1 and whose mean texture
# is above 20; both conditions must hold.
concavity_below_limit = selected_columns['concavity_mean'] < 0.1
texture_above_limit = selected_columns['texture_mean'] > 20
filtered_df = selected_columns.where(concavity_below_limit & texture_above_limit)

In [36]:
# Print the leading rows that passed the filter (show() defaults to 20 rows).
filtered_df.show()

+---------+-----------+------------+------------+-------------+----------------+-----------------+--------------+---------------+
|diagnosis|radius_mean|radius_worst|texture_mean|texture_worst|compactness_mean|compactness_worst|concavity_mean|concavity_worst|
+---------+-----------+------------+------------+-------------+----------------+-----------------+--------------+---------------+
|        M|      13.71|       17.06|       20.83|        28.14|          0.1645|           0.3682|       0.09366|         0.2678|
|        M|      16.02|       19.19|       23.24|        33.88|         0.06669|           0.1551|       0.03299|         0.1459|
|        M|      15.85|       16.84|       23.95|        27.66|          0.1002|           0.1924|       0.09938|         0.2322|
|        M|      14.68|       19.07|       20.13|        30.88|           0.072|           0.1871|       0.07395|         0.2914|
|        M|      14.99|       14.99|        25.2|         25.2|         0.05131|          

Grouping by diagnosis and taking the maximum of the needed columns to display the information I need

In [38]:
# Per-diagnosis maximum of each listed measurement; Spark names the result
# columns max(<column>).
max_columns = [
    'radius_mean',
    'radius_worst',
    'texture_mean',
    'compactness_mean',
    'concavity_mean',
]
grouped_patients = filtered_df.groupBy('diagnosis').max(*max_columns)

In [21]:
# Print the aggregated result: one row per diagnosis value.
grouped_patients.show()

[Stage 16:>                                                         (0 + 1) / 1]

+---------+----------------+-----------------+-----------------+---------------------+-------------------+
|diagnosis|max(radius_mean)|max(radius_worst)|max(texture_mean)|max(compactness_mean)|max(concavity_mean)|
+---------+----------------+-----------------+-----------------+---------------------+-------------------+
|        B|           14.99|            16.76|            33.81|               0.1552|            0.09252|
|        M|           20.48|            24.31|            29.81|               0.1645|            0.09938|
+---------+----------------+-----------------+-----------------+---------------------+-------------------+



                                                                                

Writing the needed information to my MySQL database

In [39]:

# Connection settings are read from the environment so credentials are not
# stored in the notebook. The URL and user fall back to the values of the
# original development setup; the password is never hardcoded.
jdbc_url = os.environ.get("MYSQL_JDBC_URL", "jdbc:mysql://192.168.0.101:3306/erick")
jdbc_user = os.environ.get("MYSQL_USER", "root")
# Prompt interactively when the password is not supplied via the environment.
jdbc_password = os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: ")

# mode("overwrite") replaces the table on every run. The default save mode
# raises an error once the table exists, which would make this cell fail
# on a second run of the notebook.
(
    grouped_patients.write
    .format("jdbc")
    .mode("overwrite")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("url", jdbc_url)
    .option("dbtable", "grouped_patients")
    .option("user", jdbc_user)
    .option("password", jdbc_password)
    .save()
)

When I check MySQL Workbench, the table and the data have been written correctly