In [1]:
import os
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark import SparkConf


conf = SparkConf().setAppName("users_csv_deom").setMaster("local[*]")
sc = SparkContext(conf=conf)

In [2]:
from pyspark import SQLContext
sqlContext = SQLContext(sc)

In [7]:
df = sqlContext.read.csv("users.csv", inferSchema = True, header = True)

In [9]:
df.show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
| Vishwa| 61|  Engineer|
|  Mohan| 79|    Doctor|
|Rishavv| 21|   Student|
|Shivani| 69|Consultant|
| Sachin| 35| Cricketer|
|  Rohit| 31|   Captain|
|  Virat| 32|   Blogger|
| Akshay| 45|     Actor|
|Amitabh| 70| Superstar|
+-------+---+----------+



In [11]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, DoubleType, LongType

In [12]:
fileSchema = StructType([StructField('name', StringType(),True),
                        StructField('age', IntegerType(),True),
                        StructField('job', StringType(),True)])

df2 = sqlContext.read.load("users.csv", format="csv", sep=",", schema = fileSchema, header="true") 

In [13]:
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)



In [14]:
df2.show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
| Vishwa| 61|  Engineer|
|  Mohan| 79|    Doctor|
|Rishavv| 21|   Student|
|Shivani| 69|Consultant|
| Sachin| 35| Cricketer|
|  Rohit| 31|   Captain|
|  Virat| 32|   Blogger|
| Akshay| 45|     Actor|
|Amitabh| 70| Superstar|
+-------+---+----------+



In [16]:
df.write.parquet("users_2.parquet")

In [17]:
df.write.json("users_2.json")

In [18]:
df.write.orc("users_2.orc")

In [20]:
#Specifying the schema in the case of json
fileSchema = StructType([StructField('name', StringType(),True),
                        StructField('age', IntegerType(),True),
                        StructField('job', StringType(),True)])

df2 = sqlContext.read.json("users_2.json", schema = fileSchema) 

In [21]:
df.show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
| Vishwa| 61|  Engineer|
|  Mohan| 79|    Doctor|
|Rishavv| 21|   Student|
|Shivani| 69|Consultant|
| Sachin| 35| Cricketer|
|  Rohit| 31|   Captain|
|  Virat| 32|   Blogger|
| Akshay| 45|     Actor|
|Amitabh| 70| Superstar|
+-------+---+----------+



In [22]:
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)



In [23]:
import numpy as np
import pandas as pd

In [24]:
pdf = pd.DataFrame(np.random.rand(10,3))

In [25]:
print(pdf)

          0         1         2
0  0.162102  0.752270  0.503119
1  0.111237  0.334659  0.546670
2  0.334442  0.379681  0.779174
3  0.716286  0.178263  0.621574
4  0.782467  0.478217  0.185836
5  0.587427  0.246560  0.433718
6  0.987351  0.452952  0.257361
7  0.634099  0.807839  0.506333
8  0.671664  0.769215  0.560576
9  0.817503  0.414078  0.240089


In [26]:
df_np = sqlContext.createDataFrame(pdf)

In [27]:
df_np.printSchema()

root
 |-- 0: double (nullable = true)
 |-- 1: double (nullable = true)
 |-- 2: double (nullable = true)



In [28]:
df_np.show(5)

+-------------------+-------------------+-------------------+
|                  0|                  1|                  2|
+-------------------+-------------------+-------------------+
|0.16210162999034838| 0.7522696067598241| 0.5031188309558745|
|0.11123681289662746| 0.3346588015370122| 0.5466703099942103|
|  0.334442398836732|0.37968123951754695| 0.7791742407229317|
|  0.716285881179824|0.17826272138169352| 0.6215743897464934|
| 0.7824669889113246|0.47821744106183406|0.18583555718525424|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [30]:
result_df = df_np.select("*").toPandas()

In [31]:
print(result_df)

          0         1         2
0  0.162102  0.752270  0.503119
1  0.111237  0.334659  0.546670
2  0.334442  0.379681  0.779174
3  0.716286  0.178263  0.621574
4  0.782467  0.478217  0.185836
5  0.587427  0.246560  0.433718
6  0.987351  0.452952  0.257361
7  0.634099  0.807839  0.506333
8  0.671664  0.769215  0.560576
9  0.817503  0.414078  0.240089


In [34]:
df_np.describe().show()

+-------+-------------------+-------------------+-------------------+
|summary|                  0|                  1|                  2|
+-------+-------------------+-------------------+-------------------+
|  count|                 10|                 10|                 10|
|   mean| 0.5804577656585823|0.48137340852102384|0.46344514613151044|
| stddev| 0.2882837802684245|0.22287098183631648|0.18701555054595884|
|    min|0.11123681289662746|0.17826272138169352|0.18583555718525424|
|    max| 0.9873510557885089| 0.8078389669792966| 0.7791742407229317|
+-------+-------------------+-------------------+-------------------+



In [33]:
result_df.describe()

Unnamed: 0,0,1,2
count,10.0,10.0,10.0
mean,0.580458,0.481373,0.463445
std,0.288284,0.222871,0.187016
min,0.111237,0.178263,0.185836
25%,0.397688,0.345914,0.30145
50%,0.652882,0.433515,0.504726
75%,0.765922,0.683757,0.5571
max,0.987351,0.807839,0.779174


In [35]:
df_np

DataFrame[0: double, 1: double, 2: double]

In [36]:
df_iris = sqlContext.read.csv("IRIS.csv", inferSchema = True, header = True)

In [38]:
df_iris.filter(df_iris["species"] == 'Iris-setosa').count()

50

In [40]:
df_iris.filter(df_iris['species'] == 'Iris-versicolor').groupBy('species').sum('sepal_width','petal_length').show()

+---------------+------------------+------------------+
|        species|  sum(sepal_width)| sum(petal_length)|
+---------------+------------------+------------------+
|Iris-versicolor|138.50000000000003|212.99999999999997|
+---------------+------------------+------------------+



In [42]:
df_iris.filter(groupBy('species').sum('sepal_width','petal_length').show()

SyntaxError: unexpected EOF while parsing (<ipython-input-42-626b8cb511a3>, line 1)

In [43]:
df_iris.filter((df_iris['species']=="Iris-setosa") & (df_iris['sepal_width']>4) & (df_iris['sepal_width']<5)).count()

3

In [44]:
df_iris.filter(df_iris['species'] == 'Iris-virginica' ).groupBy('species').min('petal_width').show()



+--------------+----------------+
|       species|min(petal_width)|
+--------------+----------------+
|Iris-virginica|             1.4|
+--------------+----------------+



In [45]:
df_iris.groupBy('species').min('petal_width').show()

+---------------+----------------+
|        species|min(petal_width)|
+---------------+----------------+
| Iris-virginica|             1.4|
|    Iris-setosa|             0.1|
|Iris-versicolor|             1.0|
+---------------+----------------+



In [None]:

df.filter(df['species'] == 'Iris-virginica' ).groupBy('species').max('petal_width').show(), 0.4