In [1]:
import pandas as pd

# Load the dataset
file_path = (r"C:\Users\riyas\OneDrive - University of Birmingham\Documents\MSc Data Science\2. group project data science\cleaned datasets\merged_turbine_crane.csv")
df = pd.read_csv(file_path)

# Display basic information about the dataset
df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 11 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   WT_YEAR                       1048575 non-null  int64  
 1   WT_LATITUDE                   1048575 non-null  float64
 2   WT_LONGITUDE                  1048575 non-null  float64
 3   WT_COUNTRY                    1048575 non-null  object 
 4   CC_COUNT_POST_WT              1048575 non-null  object 
 5   CC_LATITUDE                   1048575 non-null  float64
 6   CC_LONGITUDE                  1048575 non-null  float64
 7   CC_OBSERVATION_DATE           1048575 non-null  object 
 8   CC_TIME_OBSERVATIONS_STARTED  978641 non-null   object 
 9   CC_DURATION_MINUTES           951033 non-null   float64
 10  Distance WT to CC (km)        5000 non-null     float64
dtypes: float64(6), int64(1), object(4)
memory usage: 88.0+ MB


(None,
    WT_YEAR  WT_LATITUDE  WT_LONGITUDE WT_COUNTRY CC_COUNT_POST_WT  \
 0     2023      28.3553       33.0622      Egypt               20   
 1     2023      28.3553       33.0622      Egypt                1   
 2     2023      28.3553       33.0622      Egypt                2   
 3     2023      28.3553       33.0622      Egypt              400   
 4     2023      28.3553       33.0622      Egypt                1   
 
    CC_LATITUDE  CC_LONGITUDE CC_OBSERVATION_DATE CC_TIME_OBSERVATIONS_STARTED  \
 0    26.845628     33.998315          22/10/2023                     00:34:00   
 1    28.010269     34.408268          23/12/2023                     14:31:00   
 2    27.099550     33.837988          01/04/2023                     10:00:00   
 3    27.393946     33.681893          14/03/2023                     10:45:00   
 4    27.940700     34.301800          24/12/2023                     07:20:00   
 
    CC_DURATION_MINUTES  Distance WT to CC (km)  
 0                 90.0    

In [11]:
# Convert CC_OBSERVATION_DATE to datetime format
df["CC_OBSERVATION_DATE"] = pd.to_datetime(df["CC_OBSERVATION_DATE"], errors="coerce", dayfirst=True)

# Convert CC_TIME_OBSERVATIONS_STARTED to time format, handling missing values
df["CC_TIME_OBSERVATIONS_STARTED"] = pd.to_datetime(df["CC_TIME_OBSERVATIONS_STARTED"], errors="coerce").dt.time

# Convert CC_COUNT_POST_WT to numeric (handling potential errors)
df["CC_COUNT_POST_WT"] = pd.to_numeric(df["CC_COUNT_POST_WT"], errors="coerce")

# Check how many missing values remain
missing_values = df.isnull().sum()

# Display cleaned dataset summary
df.info(), missing_values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 11 columns):
 #   Column                        Non-Null Count    Dtype         
---  ------                        --------------    -----         
 0   WT_YEAR                       1048575 non-null  int64         
 1   WT_LATITUDE                   1048575 non-null  float64       
 2   WT_LONGITUDE                  1048575 non-null  float64       
 3   WT_COUNTRY                    1048575 non-null  object        
 4   CC_COUNT_POST_WT              1048575 non-null  float64       
 5   CC_LATITUDE                   1048575 non-null  float64       
 6   CC_LONGITUDE                  1048575 non-null  float64       
 7   CC_OBSERVATION_DATE           1048575 non-null  datetime64[ns]
 8   CC_TIME_OBSERVATIONS_STARTED  0 non-null        datetime64[ns]
 9   CC_DURATION_MINUTES           1048575 non-null  float64       
 10  Distance WT to CC (km)        1048575 non-null  float64       
dty

(None,
 WT_YEAR                               0
 WT_LATITUDE                           0
 WT_LONGITUDE                          0
 WT_COUNTRY                            0
 CC_COUNT_POST_WT                      0
 CC_LATITUDE                           0
 CC_LONGITUDE                          0
 CC_OBSERVATION_DATE                   0
 CC_TIME_OBSERVATIONS_STARTED    1048575
 CC_DURATION_MINUTES                   0
 Distance WT to CC (km)                0
 dtype: int64)

In [3]:
from geopy.distance import geodesic
import numpy as np


In [4]:
# Function to calculate distance between turbine and crane locations
def calculate_distance(row):
    if np.isnan(row["Distance WT to CC (km)"]):  # Only compute if missing
        wt_coords = (row["WT_LATITUDE"], row["WT_LONGITUDE"])
        cc_coords = (row["CC_LATITUDE"], row["CC_LONGITUDE"])
        return geodesic(wt_coords, cc_coords).km
    return row["Distance WT to CC (km)"]  # Keep existing values

# Apply the distance calculation
df["Distance WT to CC (km)"] = df.apply(calculate_distance, axis=1)

In [5]:
# Impute missing CC_COUNT_POST_WT with the median count per country
df["CC_COUNT_POST_WT"] = df.groupby("WT_COUNTRY")["CC_COUNT_POST_WT"].transform(lambda x: x.fillna(x.median()))

In [6]:
# Impute missing CC_DURATION_MINUTES with the median duration per country
df["CC_DURATION_MINUTES"] = df.groupby("WT_COUNTRY")["CC_DURATION_MINUTES"].transform(lambda x: x.fillna(x.median()))

In [7]:

# Check final missing values
missing_values_after = df.isnull().sum()

# Display results
missing_values_after

WT_YEAR                             0
WT_LATITUDE                         0
WT_LONGITUDE                        0
WT_COUNTRY                          0
CC_COUNT_POST_WT                    0
CC_LATITUDE                         0
CC_LONGITUDE                        0
CC_OBSERVATION_DATE                 0
CC_TIME_OBSERVATIONS_STARTED    69934
CC_DURATION_MINUTES                 0
Distance WT to CC (km)              0
dtype: int64

In [9]:
df

Unnamed: 0,WT_YEAR,WT_LATITUDE,WT_LONGITUDE,WT_COUNTRY,CC_COUNT_POST_WT,CC_LATITUDE,CC_LONGITUDE,CC_OBSERVATION_DATE,CC_TIME_OBSERVATIONS_STARTED,CC_DURATION_MINUTES,Distance WT to CC (km)
0,2023,28.3553,33.0622,Egypt,20.0,26.845628,33.998315,2023-10-22,00:34:00,90.0,1078.067489
1,2023,28.3553,33.0622,Egypt,1.0,28.010269,34.408268,2023-12-23,14:31:00,116.0,1047.972563
2,2023,28.3553,33.0622,Egypt,2.0,27.099550,33.837988,2023-04-01,10:00:00,90.0,140.474153
3,2023,28.3553,33.0622,Egypt,400.0,27.393946,33.681893,2023-03-14,10:45:00,90.0,949.922860
4,2023,28.3553,33.0622,Egypt,1.0,27.940700,34.301800,2023-12-24,07:20:00,281.0,721.183116
...,...,...,...,...,...,...,...,...,...,...,...
1048570,2018,15.2955,76.3177,India,100.0,23.137913,71.768918,2018-01-06,16:17:00,126.0,990.927164
1048571,2018,15.2955,76.3177,India,100.0,23.207587,71.739006,2018-11-25,07:30:00,180.0,999.148949
1048572,2018,15.2955,76.3177,India,9.0,23.578610,69.318500,2018-12-09,09:30:00,120.0,1174.582263
1048573,2018,15.2955,76.3177,India,2.0,28.649528,76.608570,2018-11-19,12:26:00,212.0,1479.041902


In [10]:
df.to_csv("merged_wt_cc.csv", index=False)

Exploratory Data Analysis (EDA):

Distribution of wind turbine installations by country and year.
Common crane observation trends over time.
Spatial analysis of wind turbine locations vs. crane sightings.
Investigate Impact:

Analyse if the presence of wind turbines affects crane migration distances.
Check if certain countries or years show more significant trends.