In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook under the application name 'csvReader'.
spark = (
    SparkSession.builder
    .appName('csvReader')
    .getOrCreate()
)

In [2]:
# Relative folder that holds the input CSV files; reused later when writing output.
path = 'dataset/'

# Load the students CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
students = spark.read.csv(
    path + 'students.csv',
    header=True,
    inferSchema=True,
)


In [3]:
# Print the first five rows as a plain-text table.
students.show(n=5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
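|  male|       group C|               some college|    standard|                   none|        76|           78|           75|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+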

In [4]:
# Take the first five rows and convert them to a pandas DataFrame so the
# notebook displays them as the cell's result.
(
    students
    .limit(5)
    .toPandas()
)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Sanity check: show which class the CSV reader returned for `students`.
students_type = type(students)
print(students_type)


<class 'pyspark.sql.dataframe.DataFrame'>


In [6]:
# Convert the whole Spark DataFrame to pandas (this collects every row locally),
# then show the class of the converted object.
studentsPandas = students.toPandas()
pandas_type = type(studentsPandas)
print(pandas_type)

<class 'pandas.core.frame.DataFrame'>


In [7]:
# Print the column names, inferred types and nullability of `students` as a tree.
students.printSchema()

root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- parental level of education: string (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test preparation course: string (nullable = true)
 |-- math score: integer (nullable = true)
 |-- reading score: integer (nullable = true)
 |-- writing score: integer (nullable = true)



In [8]:
# Display the list of column names (several contain spaces or a slash).
students.columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course',
 'math score',
 'reading score',
 'writing score']

In [9]:
# Summary statistics for every column; for the string columns mean and stddev
# come back as NULL.
(
    students
    .describe()
    .show()
)

+-------+------+--------------+---------------------------+------------+-----------------------+------------------+------------------+-----------------+
|summary|gender|race/ethnicity|parental level of education|       lunch|test preparation course|        math score|     reading score|    writing score|
+-------+------+--------------+---------------------------+------------+-----------------------+------------------+------------------+-----------------+
|  count|  1000|          1000|                       1000|        1000|                   1000|              1000|              1000|             1000|
|   mean|  NULL|          NULL|                       NULL|        NULL|                   NULL|            66.089|            69.169|           68.054|
| stddev|  NULL|          NULL|                       NULL|        NULL|                   NULL|15.163080096009454|14.600191937252223|15.19565701086966|
|    min|female|       group A|         associate's degree|free/reduced|          

In [10]:
# Look up the 'math score' field in the schema and display its Spark data type.
math_score_field = students.schema['math score']
math_score_field.dataType

IntegerType()

In [11]:
# Summary statistics restricted to the 'math score' column.
(
    students
    .describe(['math score'])
    .show()
)

+-------+------------------+
|summary|        math score|
+-------+------------------+
|  count|              1000|
|   mean|            66.089|
| stddev|15.163080096009454|
|    min|                 0|
|    max|               100|
+-------+------------------+



In [12]:
# Min, max, count and quartiles (25%, 50%, 75%) for the three score columns.
(
    students
    .select("math score", "writing score", "reading score")
    .summary("min", "max", "count", "25%", "50%", "75%")
    .show()
)

+-------+----------+-------------+-------------+
|summary|math score|writing score|reading score|
+-------+----------+-------------+-------------+
|    min|         0|           10|           17|
|    max|       100|          100|          100|
|  count|      1000|         1000|         1000|
|    25%|        57|           57|           59|
|    50%|        66|           69|           70|
|    75%|        77|           79|           79|
+-------+----------+-------------+-------------+



In [13]:
# Small in-memory sample: three (label, number) pairs.
values = [
    ('A', 10),
    ('B', 20),
    ('C', 30),
]

# No schema or column names are given, so the columns get the default
# names `_1` and `_2`.
df = spark.createDataFrame(values)

In [14]:
# Print the sample DataFrame built from `values` as a plain-text table.
df.show()

+---+---+
| _1| _2|
+---+---+
|  A| 10|
|  B| 20|
|  C| 30|
+---+---+



In [15]:
# Write the students data back out as CSV at `dataset/students_1`, replacing any
# output already present at that location ('overwrite' mode).
#
# header=True writes the column names as the first row. Spark's CSV writer omits
# the header by default, so without it the column names are lost, and reading the
# output back with header=True (as this notebook does for its input) would
# consume the first data row as column names.
students.write.mode('overwrite').csv(path + "students_1", header=True)