In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('daily_weather').getOrCreate()

In [23]:
daily_weather_df = spark.read.csv('/content/drive/MyDrive/daily_weather.csv', header=True, inferSchema=True)

In [24]:
daily_weather_df.show()

+------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|number| air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|     0|918.0600000000087| 74.82200000000041|                 271.1| 2.080354199999768|    295.39999999999986| 2.863283199999908|                  0.0|              0.0|    42.42000000000046|   36.160000000000494|
|     1|917.3476881177097| 71.40384263106537|    101.93517935618371|2.4430092157340217|    140.47154847112498|3.5333236016106238|               

In [25]:
daily_weather_df.printSchema()

root
 |-- number: integer (nullable = true)
 |-- air_pressure_9am: double (nullable = true)
 |-- air_temp_9am: double (nullable = true)
 |-- avg_wind_direction_9am: double (nullable = true)
 |-- avg_wind_speed_9am: double (nullable = true)
 |-- max_wind_direction_9am: double (nullable = true)
 |-- max_wind_speed_9am: double (nullable = true)
 |-- rain_accumulation_9am: double (nullable = true)
 |-- rain_duration_9am: double (nullable = true)
 |-- relative_humidity_9am: double (nullable = true)
 |-- relative_humidity_3pm: double (nullable = true)



In [26]:
daily_weather_df.describe().show()

+-------+------------------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|summary|            number| air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am| rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|  count|              1095|             1092|              1090|                  1091|              1092|                  1092|              1091|                 1089|              1092|                 1095|                 1095|
|   mean|             547.0|918.8825513138094| 64.9330014128

In [27]:
daily_weather_df.corr("rain_accumulation_9am", "rain_duration_9am")

0.733796878331099

In [28]:
daily_weather_df_without_null = daily_weather_df.na.drop()

In [29]:
daily_weather_df_without_null.count()

1064

In [30]:
daily_weather_df_without_null.describe("air_temp_9am").show()

+-------+------------------+
|summary|      air_temp_9am|
+-------+------------------+
|  count|              1064|
|   mean| 65.02260949558733|
| stddev|11.168033449415704|
|    min|36.752000000000685|
|    max| 98.90599999999992|
+-------+------------------+



In [31]:
daily_weather_df.columns

['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

In [32]:
mean_values = {col_name: daily_weather_df.agg({col_name: "mean"}).collect()[0][0] for col_name in daily_weather_df.columns}

In [33]:
print(mean_values)

{'number': 547.0, 'air_pressure_9am': 918.8825513138094, 'air_temp_9am': 64.93300141287072, 'avg_wind_direction_9am': 142.2355107005759, 'avg_wind_speed_9am': 5.50828424225493, 'max_wind_direction_9am': 148.95351796516923, 'max_wind_speed_9am': 7.019513529175272, 'rain_accumulation_9am': 0.20307895225211126, 'rain_duration_9am': 294.1080522756142, 'relative_humidity_9am': 34.24140205923536, 'relative_humidity_3pm': 35.34472714825898}


In [34]:
for col_name, mean_value in mean_values.items():
  daily_weather_df_replace_mean = daily_weather_df.na.fill(mean_value, [col_name])

In [35]:
daily_weather_df_replace_mean.show()

+------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|number| air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|     0|918.0600000000087| 74.82200000000041|                 271.1| 2.080354199999768|    295.39999999999986| 2.863283199999908|                  0.0|              0.0|    42.42000000000046|   36.160000000000494|
|     1|917.3476881177097| 71.40384263106537|    101.93517935618371|2.4430092157340217|    140.47154847112498|3.5333236016106238|               

In [36]:
daily_weather_df_replace_mean.count()

1095

In [37]:
daily_weather_df_replace_mean.describe().show()

+-------+------------------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|summary|            number| air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am| rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|  count|              1095|             1092|              1090|                  1091|              1092|                  1092|              1091|                 1089|              1092|                 1095|                 1095|
|   mean|             547.0|918.8825513138094| 64.9330014128

In [38]:
daily_weather_df_replace_mean.summary().show()

+-------+------------------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|summary|            number| air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am| rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|  count|              1095|             1092|              1090|                  1091|              1092|                  1092|              1091|                 1089|              1092|                 1095|                 1095|
|   mean|             547.0|918.8825513138094| 64.9330014128

In [39]:
daily_weather_df_classification = daily_weather_df.na.drop()

In [41]:
daily_weather_df_classification = daily_weather_df_classification.drop("number")

In [42]:
daily_weather_df_classification.show()

+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
| air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|918.0600000000087| 74.82200000000041|                 271.1| 2.080354199999768|    295.39999999999986| 2.863283199999908|                  0.0|              0.0|    42.42000000000046|   36.160000000000494|
|917.3476881177097| 71.40384263106537|    101.93517935618371|2.4430092157340217|    140.47154847112498|3.5333236016106238|                  0.0|              0.0|   24.3286

In [47]:
from pyspark.sql.functions import when, col

# Create a new categorical column: 1 if Humidity < 30, else 0
daily_weather_df_classification = daily_weather_df_classification.withColumn("low_humidity", when(col("relative_humidity_3pm") < 30, 1).otherwise(0))

# Show updated DataFrame
daily_weather_df_classification.select("low_humidity", "relative_humidity_3pm").show(10)

+------------+---------------------+
|low_humidity|relative_humidity_3pm|
+------------+---------------------+
|           0|   36.160000000000494|
|           1|     19.4265967985621|
|           1|   14.460000000000045|
|           1|   12.742547353761848|
|           0|    76.74000000000046|
|           0|   33.930000000000256|
|           1|   21.385656725200974|
|           0|    74.92000000000041|
|           1|   24.030000000000427|
|           0|     68.0500000000012|
+------------+---------------------+
only showing top 10 rows



In [48]:
train_df, test_df = daily_weather_df_classification.randomSplit([0.8, 0.2], seed=42)

In [49]:
train_df.count()

886

In [50]:
test_df.count()

178

In [51]:
from pyspark.ml.feature import VectorAssembler

feature_cols = ["air_pressure_9am", "air_temp_9am", "avg_wind_speed_9am",
                "max_wind_speed_9am", "rain_accumulation_9am", "relative_humidity_9am"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Biến đổi dữ liệu
train_df = assembler.transform(train_df)
test_df = assembler.transform(test_df)

# Chỉ giữ lại cột cần thiết
train_df = train_df.select("features", "low_humidity")
test_df = test_df.select("features", "low_humidity")


In [52]:
from pyspark.ml.classification import DecisionTreeClassifier

# Tạo mô hình Decision Tree
dt = DecisionTreeClassifier(labelCol="low_humidity", featuresCol="features")

# Huấn luyện mô hình
dt_model = dt.fit(train_df)


In [53]:
predictions = dt_model.transform(test_df)

# Hiển thị 10 dòng đầu tiên
predictions.select("features", "low_humidity", "prediction").show(10)


+--------------------+------------+----------+
|            features|low_humidity|prediction|
+--------------------+------------+----------+
|[908.970000000004...|           0|       0.0|
|[910.450000000002...|           0|       0.0|
|[911.000000000007...|           1|       1.0|
|[911.680000000008...|           0|       0.0|
|[912.410000000007...|           0|       0.0|
|[912.650000000008...|           0|       0.0|
|[912.900000000012...|           0|       0.0|
|[913.070000000008...|           0|       0.0|
|[913.490000000008...|           0|       0.0|
|[913.500000000004...|           0|       0.0|
+--------------------+------------+----------+
only showing top 10 rows



In [54]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="low_humidity", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Độ chính xác của mô hình: {accuracy:.2f}")


Độ chính xác của mô hình: 0.87


In [56]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Define a UDF to convert the vector to a string representation
vector_to_string = udf(lambda v: str(v), StringType())

# Apply the UDF to the 'features' column
predictions = predictions.withColumn("features_str", vector_to_string("features"))

# Select the desired columns, including the new string representation
predictions.select("features_str", "low_humidity", "prediction").write.csv("predictions.csv", header=True)