In [0]:
# Load the pipe-delimited user file, treating the first line as a header and
# letting Spark infer the column types.
csv_reader = (
    spark.read.format('csv')
    .option('inferschema', True)
    .option('Header', True)
    .option('sep', '|')
)
user_data = csv_reader.load('/FileStore/tables/User_Data.csv')
user_data.display()

In [0]:
from pyspark.sql.functions import *
# First 25 rows (limit() restricts the number of rows, not columns).
first_25_rows = user_data.limit(25)
first_25_rows.display()

In [0]:
# Last 10 rows of user_data.
# tail() returns a plain Python list of Row objects on the driver, so a DataFrame
# is rebuilt from it for display. The source schema is passed explicitly so the
# column types are not re-inferred and an empty result does not fail inference.
last_10_rows = user_data.tail(10)
last_10_rows_df = spark.createDataFrame(last_10_rows, schema=user_data.schema)
# display() is called as a separate statement: it returns None, so chaining it
# onto the assignment would leave last_10_rows_df bound to None.
last_10_rows_df.display()

# Alternative that stays a DataFrame end to end: sort descending on the first
# column and keep 10 rows.
# NOTE(review): this matches the "last 10 rows" only if the first column
# increases with row position, and the rows come back in descending order — confirm.
last_10_rows_df = user_data.orderBy(user_data.columns[0], ascending=False).limit(10)
last_10_rows_df.display()


In [0]:
# Print the names of all columns.
column_names = user_data.columns
print(column_names)

# Show each distinct value of the Occupation column.
distinct_occupations = user_data.select('Occupation').distinct()
distinct_occupations.display()


In [0]:
# Most frequent occupation: count users per occupation, sort by that count in
# descending order and keep the top row (ties are not broken explicitly).
# The DataFrame is assigned first and displayed afterwards, because display()
# returns None and chaining it would bind the variable to None.
most_frequent_occupation = (
    user_data.groupBy('Occupation')
    .count()
    .orderBy(col('count'), ascending=False)
    .limit(1)
)
most_frequent_occupation.display()

In [0]:
# Mean age of users, as a one-row DataFrame.
# Assigned before display() so that mean_age holds the DataFrame rather than
# the None that display() returns.
mean_age = user_data.agg(mean('age'))
mean_age.display()


In [0]:
# The five least frequent ages: count users per age, sort by that count in
# ascending order and keep the first 5 rows (ties are not broken explicitly).
# Assigned before display() so that least_age holds the DataFrame rather than
# the None that display() returns.
least_age = (
    user_data.groupBy('age')
    .count()
    .orderBy(col('count'), ascending=True)
    .limit(5)
)
least_age.display()

In [0]:
# Creating df from raw data with defined schema
from pyspark.sql.types import *

# Column-oriented sample data: each key is a column name, each list holds that
# column's values (12 entries per column).
raw_data = {
    'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks',
                 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons',
                 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
    'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd'],
    'deaths': [523, 52, 25, 616, 43, 234, 523, 62, 62, 73, 37, 35],
    'battles': [5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
    'size': [1045, 957, 1099, 1400, 1592, 1006, 987, 849, 973, 1005, 1099, 1523],
    'veterans': [1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
    'readiness': [1, 2, 3, 3, 2, 1, 2, 3, 2, 1, 2, 3],
    'armored': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
    'deserters': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    'origin': ['Arizona', 'California', 'Texas', 'Florida', 'Maine', 'Iowa',
               'Alaska', 'Washington', 'Oregon', 'Wyoming', 'Louisana', 'Georgia'],
}

# Explicit schema: the three text columns are strings, every other column is
# an integer; all fields are nullable. Field order follows raw_data's key order.
string_columns = {"regiment", "company", "origin"}
schema = StructType([
    StructField(name, StringType() if name in string_columns else IntegerType(), True)
    for name in raw_data
])

# Transpose the column lists into row tuples and build the DataFrame with the
# schema defined above.
rows = list(zip(*raw_data.values()))
df = spark.createDataFrame(rows, schema=schema)

df.display()
df.printSchema()

In [0]:
# Creating df from raw data without schema
from pyspark.sql.types import *

# Column-oriented sample data: each key is a column name, each list holds that
# column's values (12 entries per column).
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
            'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
            'deaths': [523, 52, 25, 616, 43, 234, 523, 62, 62, 73, 37, 35],
            'battles': [5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
            'size': [1045, 957, 1099, 1400, 1592, 1006, 987, 849, 973, 1005, 1099, 1523],
            'veterans': [1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
            'readiness': [1, 2, 3, 3, 2, 1, 2, 3, 2, 1, 2, 3],
            'armored': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
            'deserters': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
            'origin': ['Arizona', 'California', 'Texas', 'Florida', 'Maine', 'Iowa', 'Alaska', 'Washington', 'Oregon', 'Wyoming', 'Louisana', 'Georgia']}

# Create the DataFrame WITHOUT an explicit schema: only the column names are
# supplied (taken from the dict keys), so Spark infers each column's type from
# the data. Compare the printSchema() output with the explicit-schema version.
# zip(*...) transposes the column lists into row tuples; zip already yields
# tuples, so no extra tuple() conversion is needed.
df2 = spark.createDataFrame(list(zip(*raw_data.values())), schema=list(raw_data.keys()))

df2.display()
df2.printSchema()

In [0]:
# Select the 'deaths', 'size' and 'deserters' columns (plus 'origin', kept so
# each row can be identified) for the rows whose origin is Maine or Alaska.
# The DataFrame is assigned first and displayed afterwards, because display()
# returns None and chaining it would bind the variable to None.
# The variable name `filtter` is kept as-is in case other cells refer to it.
filtter = (
    df.filter(col('origin').isin(['Maine', 'Alaska']))
    .select('deaths', 'size', 'deserters', 'origin')
)
filtter.display()

In [0]:
# Python slices are 0-based with an exclusive end, so 1-based columns a..b
# (inclusive) correspond to df.columns[a-1:b].
columns = df.columns[2:6]   # Columns 3 to 6 (1-based, inclusive)
# Was [5:8], which yields columns 6 to 8; [4:8] matches the stated intent of
# "columns 5 to 8" using the same convention as the line above.
columns2 = df.columns[4:8]  # Columns 5 to 8 (1-based, inclusive)

# Assign each selection before displaying it: display() returns None, so
# chaining it onto the assignment would bind df_selected_columns to None.
df_selected_columns = df.select(*columns)
df_selected_columns.display()
df_selected_columns = df.select(*columns2)
df_selected_columns.display()


In [0]:
# Rows whose death count is above 500 or below 50.
deaths_col = col("deaths")
is_extreme = (deaths_col > 500) | (deaths_col < 50)
df.filter(is_extreme).display()


In [0]:
# Every row whose regiment is something other than "Dragoons".
not_dragoons = col('regiment') != 'Dragoons'
df.filter(not_dragoons).display()

In [0]:
# Select the third cell down in the column named deaths.
# Row is imported explicitly rather than relying on a wildcard import made in
# another cell.
from pyspark.sql import Row

# take(3) brings only the first three rows to the driver instead of
# collect()-ing the entire column just to read one value.
# NOTE(review): "third" relies on the DataFrame's current row order; there is
# no orderBy here — confirm that is the intended ordering.
row = df.select("deaths").take(3)[2]["deaths"]
print(row)

# The extracted value is a plain Python scalar, so it is printed above. To show
# it with display(), wrap it in a one-row DataFrame. The DataFrame is assigned
# first because display() returns None.
third_row_df = spark.createDataFrame([Row(deaths=row)])
third_row_df.display()
