Added K-Means clustering method in YearlyRainfall class

paul-florentin-charles · Jul 24, 2023 · e163512 · e163512
1 parent 7bb28dc
commit e163512
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 11 deletions.
diff --git a/src/classes/yearly_rainfall.py b/src/classes/yearly_rainfall.py
@@ -10,6 +10,7 @@
 from scipy import signal
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import r2_score
+from sklearn.cluster import KMeans
 
 import src.config as cfg
 from src.decorators import plots
@@ -29,7 +30,7 @@ def __init__(self,
         self.starting_year: int = cfg.get_start_year() \
             if start_year is None \
             else start_year
-        self.round_precision: int = cfg.get_round_precision() \
+        self.round_precision: int = cfg.get_rainfall_precision() \
             if round_precision is None \
             else round_precision
         self.yearly_rainfall: pd.DataFrame = self.load_yearly_rainfall() \
@@ -209,7 +210,7 @@ def add_linear_regression(self) -> (float, float):
         years: np.ndarray = self.yearly_rainfall[Label.YEAR.value].values.reshape(-1, 1)
         rainfalls: np.ndarray = self.yearly_rainfall[Label.RAINFALL.value].values
 
-        reg = LinearRegression()
+        reg: LinearRegression = LinearRegression()
         reg.fit(years, rainfalls)
         self.yearly_rainfall[Label.LINEAR_REGRESSION.value] = reg.predict(years)
         self.yearly_rainfall[Label.LINEAR_REGRESSION.value] = round(
@@ -235,6 +236,19 @@ def add_savgol_filter(self) -> None:
         self.yearly_rainfall[Label.SAVITZKY_GOLAY_FILTER.value] = round(
             self.yearly_rainfall[Label.SAVITZKY_GOLAY_FILTER.value], self.round_precision)
 
+    def add_kmeans(self) -> None:
+        """
+        Compute and add K-Means clustering of Rainfall according to Year
+        to our pandas DataFrame.
+
+        :return: None
+        """
+        fit_data: np.ndarray = self.yearly_rainfall[[Label.YEAR.value, Label.RAINFALL.value]].values
+
+        kmeans: KMeans = KMeans(n_init=10, n_clusters=cfg.get_kmeans_clusters())
+        kmeans.fit(fit_data)
+        self.yearly_rainfall[Label.KMEANS.value] = kmeans.predict(fit_data)
+
     @plots.legend_and_show()
     def plot_rainfall(self, title: Optional[str] = None) -> None:
         """
@@ -244,7 +258,7 @@ def plot_rainfall(self, title: Optional[str] = None) -> None:
         :return: None
         """
         for column_label in self.yearly_rainfall.columns[1:]:
-            if column_label == Label.PERCENTAGE_OF_NORMAL.value:
+            if column_label in [Label.PERCENTAGE_OF_NORMAL.value, Label.KMEANS.value]:
                 continue
 
             plt.plot(self.yearly_rainfall[Label.YEAR.value],
@@ -268,9 +282,17 @@ def plot_normal(self, title: Optional[str] = None) -> None:
             return
 
         plt.axhline(y=100.0, color='orange', linestyle='dashed', label='Normal')
-        plt.scatter(self.yearly_rainfall[Label.YEAR.value],
-                    self.yearly_rainfall[Label.PERCENTAGE_OF_NORMAL.value],
-                    label=Label.PERCENTAGE_OF_NORMAL.value)
+        if Label.KMEANS.value not in self.yearly_rainfall.columns:
+            plt.scatter(self.yearly_rainfall[Label.YEAR.value],
+                        self.yearly_rainfall[Label.PERCENTAGE_OF_NORMAL.value],
+                        label=Label.PERCENTAGE_OF_NORMAL.value)
+        else:
+            year_rain: pd.DataFrame = self.yearly_rainfall
+            for label_value in range(cfg.get_kmeans_clusters()):
+                year_rain = year_rain[year_rain[Label.KMEANS.value] == label_value]
+                plt.scatter(year_rain[Label.YEAR.value],
+                            year_rain[Label.PERCENTAGE_OF_NORMAL.value])
+                year_rain = self.yearly_rainfall
 
         if title is not None:
             plt.title(title)

diff --git a/src/config.py b/src/config.py
@@ -42,7 +42,7 @@ def get_start_year() -> int:
     return start_year
 
 
-def get_round_precision() -> int:
+def get_rainfall_precision() -> int:
     """
     The decimal precision of Rainfall values.
 
@@ -51,6 +51,20 @@ def get_round_precision() -> int:
     rounding_precision: int = 0
     with open(CONFIG_FILE_PATH, mode=MODE, encoding=UTF_8) as stream:
         yaml_config: dict = safe_load(stream)
-        rounding_precision += yaml_config['data']['round_precision']
+        rounding_precision += yaml_config['data']['rainfall_precision']
 
     return rounding_precision
+
+
+def get_kmeans_clusters() -> int:
+    """
+    The number of clusters to use for K-Means clustering of Rainfall data.
+
+    :return: A number of clusters as an Integer.
+    """
+    n_kmeans_clusters: int = 0
+    with open(CONFIG_FILE_PATH, mode=MODE, encoding=UTF_8) as stream:
+        yaml_config: dict = safe_load(stream)
+        n_kmeans_clusters += yaml_config['data']['kmeans_clusters']
+
+    return n_kmeans_clusters
diff --git a/src/config.yaml b/src/config.yaml
@@ -7,4 +7,5 @@ dataset:
 
 data:
   start_year: 1970
-  round_precision: 2
+  rainfall_precision: 2
+  kmeans_clusters: 4
diff --git a/src/enums/labels.py b/src/enums/labels.py
@@ -14,3 +14,4 @@ class Label(str, Enum):
     PERCENTAGE_OF_NORMAL: str = 'Percentage of normal'
     LINEAR_REGRESSION: str = 'Linear regression'
     SAVITZKY_GOLAY_FILTER: str = 'Savitzky–Golay filter'
+    KMEANS: str = 'K-Means'
diff --git a/src/run.py b/src/run.py
@@ -20,12 +20,15 @@ def run() -> None:
     by_year: bool = False
     by_month: bool = False
 
+    month: Month = Month.JULY
+    season: Season = Season.SUMMER
+
     if by_year:
         yearly_rainfall: YearlyRainfall = YearlyRainfall()
     elif by_month:
-        yearly_rainfall: MonthlyRainfall = MonthlyRainfall(Month.JANUARY)
+        yearly_rainfall: MonthlyRainfall = MonthlyRainfall(month)
     else:
-        yearly_rainfall: SeasonalRainfall = SeasonalRainfall(Season.WINTER)
+        yearly_rainfall: SeasonalRainfall = SeasonalRainfall(season)
 
     avg_1970_2000 = yearly_rainfall.get_average_yearly_rainfall(1970, 2000)
     avg_1980_2010 = yearly_rainfall.get_average_yearly_rainfall(1980, 2010)
@@ -45,6 +48,7 @@ def run() -> None:
     print("Slope (in mm/year):", slope)
 
     yearly_rainfall.add_savgol_filter()
+    yearly_rainfall.add_kmeans()
 
     yearly_rainfall.plot_rainfall()
     yearly_rainfall.plot_normal()