In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import datetime as dt

# Load the raw cyclone records from disk and echo them for a quick look
source_csv = r'D:\SL NAYAK\CUTM WORKING DOC\Atom\pacific.csv'
df = pd.read_csv(source_csv)
print(df)
print("_" * 70)

# Parse the 'Date' values (read with the %Y%m%d layout) into timestamps
parsed_dates = pd.to_datetime(df['Date'], format='%Y%m%d')
df['Date'] = parsed_dates

# Derive the calendar year of every record from the parsed timestamps
df['Year'] = parsed_dates.dt.year

# Show the column index so the presence of 'Year' can be confirmed
print("Columns after adding 'Year':", df.columns)

# Function to create hemisphere columns
def hemisphere(coord):
    """Encode the hemisphere letter of a coordinate string as 0 or 1.

    The first N, S, W or E found in ``coord`` decides the result:
    N and E give 0, S and W give 1.

    Args:
        coord: Coordinate text containing a hemisphere letter.

    Returns:
        0 for the northern/eastern hemisphere, 1 otherwise.

    Raises:
        IndexError: If ``coord`` contains none of the letters N, S, W, E.
    """
    letters = re.findall(r'[NSWE]', coord)
    first_letter = letters[0]
    if first_letter == 'N' or first_letter == 'E':
        return 0
    return 1

# Creating the columns Latitude_Hemisphere and Longitude_Hemisphere.
# Each is a categorical 0/1 flag produced by hemisphere(); this has to run
# before the hemisphere letters are stripped from the coordinates below.
for coord_col in ('Latitude', 'Longitude'):
    df[f'{coord_col}_Hemisphere'] = df[coord_col].apply(hemisphere).astype('category')

# Convert the latitude and longitude columns to numeric type.
# The leading number (up to three integer digits, a dot and at most one
# decimal digit) is extracted and turned into a float; previously the matched
# text was stored as-is, so the columns stayed strings despite the stated intent.
coordinate_number = re.compile(r'\d{1,3}\.\d?')
for coord_col in ('Latitude', 'Longitude'):
    df[coord_col] = df[coord_col].apply(
        lambda text: float(coordinate_number.match(text)[0])
    )

# Handle missing values: every cell equal to -999 is replaced by the rounded
# mean of the remaining values in the same column
for column in df.columns:
    is_missing = df[column] == -999
    missing_cnt = df[column][is_missing].count()
    print(f'Missing Values in column {column} = ', missing_cnt)
    if missing_cnt == 0:
        # Nothing to impute in this column
        continue
    mean = round(df[column][~is_missing].mean())
    df.loc[is_missing, column] = mean

# Reorder the columns for readability. Columns that are not listed here (such
# as the derived 'Year') are dropped; 'ID' and 'Event' are still kept at this
# point. The explicit .copy() makes the result an independent frame, so the
# column assignments that follow do not trigger pandas' SettingWithCopyWarning.
df = df[['ID', 'Name', 'Date', 'Time', 'Event', 'Status', 'Latitude', 'Latitude_Hemisphere',
         'Longitude', 'Longitude_Hemisphere', 'Maximum Wind', 'Minimum Pressure', 'Low Wind NE',
         'Low Wind SE', 'Low Wind SW', 'Low Wind NW', 'Moderate Wind NE',
         'Moderate Wind SE', 'Moderate Wind SW', 'Moderate Wind NW',
         'High Wind NE', 'High Wind SE', 'High Wind SW', 'High Wind NW']].copy()

# Change all time to format HHMM: start by casting 'Time' to object dtype so
# the column can hold the padded strings
df['Time'] = df['Time'].astype('object')

def hhmm(time):
    """Pad a time value with zeros according to how many digits it has.

    The value is converted to ``str`` and its digit characters are counted:

    * 1 digit  -> ``'0' + text + '00'``
    * 2 digits -> ``text + '00'``
    * 3 digits -> ``'0' + text``
    * anything else -> ``text`` unchanged

    NOTE(review): 1- and 2-digit inputs are treated as whole hours. If the
    source column stores HHMM as integers, a value such as 30 may be meant as
    00:30 -- confirm against the data.

    Args:
        time: Time value of any type; it is stringified first.

    Returns:
        The padded string.
    """
    text = str(time)
    digit_count = len(re.findall(r'\d', text))
    # (prefix, suffix) to wrap around the text for each digit count
    padding = {1: ('0', '00'), 2: ('', '00'), 3: ('0', '')}
    prefix, suffix = padding.get(digit_count, ('', ''))
    return f'{prefix}{text}{suffix}'

# Pad every time value with hhmm(), then parse the padded strings with the
# %H%M layout and keep only the time-of-day part
padded_times = df['Time'].apply(hhmm)
df['Time'] = pd.to_datetime(padded_times, format='%H%M').dt.time

# Store the cyclone status as a categorical column
df['Status'] = df['Status'].astype('category')

# Working frame for the analysis: the same records without 'ID' and 'Event'
data = df.drop(['ID', 'Event'], axis=1)

# Report which columns the working frame contains
print("Columns in the 'data' DataFrame:", data.columns)
print("_" * 70)

# Preview the first ten rows
print(data.head(10))
print("_" * 70)

# Find the top ten cyclone names by number of records.
# The grouping and sort are done once and reused for both the wedge labels
# and the wedge sizes.
ranked_by_name = data.groupby('Name').count().sort_values(by='Date', ascending=False)
top_counts = ranked_by_name['Date'].head(10)
lst = [name.strip() for name in top_counts.index]
val = top_counts.values
font = {'family': 'monospace', 'weight': 'bold', 'size': 22}
plt.rc('font', **font)
fig, ax = plt.subplots()
fig.set_size_inches(12, 12)
# explode needs exactly one offset per wedge, so size it from the data:
# a hard-coded length of 10 raises when fewer than ten names exist.
ax.pie(labels=lst, x=val, autopct='%.1f%%', explode=[0.1] * len(val))
plt.title('Top Ten Hurricanes by Frequency.', fontsize=30)
plt.show()

# Month-wise frequency of cyclone records
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
mnt = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Count the records per calendar month and reindex onto all twelve months:
# months without any record get a zero bar, months that do have records keep
# their real count, and the rows line up one-to-one with the tick labels.
all_months = pd.Index(range(1, 13), name='Month')
temp = data.groupby('Month').count().reindex(all_months, fill_value=0)

font = {'family': 'monospace', 'weight': 'bold', 'size': 22}
plt.rc('font', **font)
plt.figure(figsize=(10, 10))
sns.set_style("whitegrid")
ax = sns.barplot(x=temp.index, y='Date', data=temp, palette='RdBu')
# One tick per month, labelled with the month abbreviations
plt.xticks(list(range(len(mnt))), mnt, rotation=90)
plt.ylabel('Frequency')
plt.title('Frequency of Cyclones by Month.')

# Distribution of the per-year record counts, drawn as a density histogram
# with a KDE overlay
temp = data.groupby('Year').count().sort_values(by='Date', ascending=False)
records_per_year = temp['Date'].values
plt.figure(figsize=(15, 15))
ax = sns.histplot(records_per_year, kde=True, stat='density', bins=30)
ax.set_xlabel('Probability Distribution of Frequency of Cyclones.')

# Horizontal bar chart of record counts per 'Status' category, largest first
temp = data.groupby('Status').count().sort_values(by='Date', ascending=False)
status_labels = list(temp.index)
fig, ax = plt.subplots()
fig.set_size_inches(12, 12)
sns.barplot(x='Date', y=status_labels, data=temp, palette='pastel')
plt.title('Category-wise Frequency Distribution of Cyclones.')
plt.xlabel('Frequency')
plt.ylabel('Category')
plt.show()






# 1. Apply normal feature selection techniques and find its importance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import seaborn as sns

# We are predicting 'Status' of the cyclone

# Build the feature matrix by removing the name, the date/time columns and
# the target itself; 'Status' is the label to predict
excluded_columns = ['Name', 'Status', 'Date', 'Time']
X = data.drop(columns=excluded_columns)
y = data['Status']

# Pairwise feature correlations rendered as an annotated heatmap
correlation_matrix = X.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Features')
plt.show()

# Hold out 30% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a 100-tree random forest; its importances drive the ranking below
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Importance scores and the feature positions ordered from highest to lowest
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the ranking, best feature first
print("Feature ranking:")
for rank, feature_pos in enumerate(indices, start=1):
    print(f"{rank}. feature {X.columns[feature_pos]} ({importances[feature_pos]})")

# Horizontal bar chart of the same ranking; the y axis is inverted so the
# most important feature is drawn at the top
ranked_names = [X.columns[feature_pos] for feature_pos in indices]
bar_positions = range(len(indices))
plt.figure(figsize=(12, 8))
plt.title("Feature Importances")
plt.barh(bar_positions, importances[indices], align="center")
plt.yticks(bar_positions, ranked_names)
plt.gca().invert_yaxis()
plt.show()

# Reduce the feature set with SelectFromModel, using 0.1 as the importance
# threshold (adjust as needed), and apply the selection to both splits
sfm = SelectFromModel(rf, threshold=0.1)
sfm.fit(X_train, y_train)
X_important_train, X_important_test = (
    sfm.transform(split) for split in (X_train, X_test)
)

print("Selected features shape:", X_important_train.shape)




# 2. Apply metaheuristic optimization for feature selection
from pyswarm import pso
from sklearn.model_selection import cross_val_score

# Define the objective function (minimize cross-validated error)
def objective_function(weights):
    """Score a candidate feature subset for the optimizer (lower is better).

    Each weight above 0.5 switches the corresponding column of ``X`` on. The
    score is one minus the mean 5-fold cross-validated score of a 100-tree
    random forest trained on the selected columns against ``y``.

    Args:
        weights: Array of per-feature weights.

    Returns:
        1 when no feature is selected, otherwise ``1 - mean CV score``.
    """
    selected = weights > 0.5  # continuous weights -> boolean column mask
    if np.sum(selected) != 0:
        clf = RandomForestClassifier(n_estimators=100, random_state=42)
        fold_scores = cross_val_score(clf, X.loc[:, selected], y, cv=5)
        return 1 - fold_scores.mean()
    # An empty subset cannot be evaluated; return the worst score instead
    return 1


# Bounds: each feature weight lies between 0 and 1. The lower/upper vectors
# handed to pso() are derived from this single definition.
bounds = [(0, 1)] * X.shape[1]
lower_bounds = [low for low, _ in bounds]
upper_bounds = [high for _, high in bounds]

# Run PSO to minimise objective_function over the weight vector
best_weights, best_score = pso(objective_function, lb=lower_bounds, ub=upper_bounds, swarmsize=50, maxiter=100)

# Select the best features. X is a DataFrame, so a boolean column mask has to
# go through .loc; X[:, mask] is NumPy-style indexing and raises on a DataFrame.
mask = best_weights > 0.5
X_selected = X.loc[:, mask]

print("Selected Features using PSO:")
print(X.columns[mask])
# objective_function returns 1 - score, so convert back for reporting
print("Best Score: ", 1 - best_score)

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\SL NAYAK\\CUTM WORKING DOC\\Atom\\pacific.csv'