In [1]:
#Project 1 

                                            #Global Power Plant Database

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error, mean_squared_error, r2_score

url = "https://raw.githubusercontent.com/wri/global-power-plant-database/master/source_databases_csv/database_IND.csv"
data = pd.read_csv(url)

print(data.head())
print(data.info())
print(data.describe())

data.dropna(subset=['primary_fuel', 'capacity_mw'], inplace=True)
data.fillna(0, inplace=True)

label_encoder = LabelEncoder()
data['primary_fuel'] = label_encoder.fit_transform(data['primary_fuel'])

features = ['country_long', 'capacity_mw', 'commissioning_year', 'latitude', 'longitude']
X = data[features]
y_fuel = data['primary_fuel']
y_capacity = data['capacity_mw']

X_train, X_test, y_fuel_train, y_fuel_test = train_test_split(X, y_fuel, test_size=0.2, random_state=42)
X_train, X_test, y_capacity_train, y_capacity_test = train_test_split(X, y_capacity, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_fuel_train)
fuel_predictions = rf_classifier.predict(X_test)

print("Classification Report for Primary Fuel Prediction:")
print(classification_report(y_fuel_test, fuel_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_fuel_test, fuel_predictions))
print("Accuracy Score:", accuracy_score(y_fuel_test, fuel_predictions))

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_capacity_train)
capacity_predictions = rf_regressor.predict(X_test)

print("\nRegression Metrics for Capacity_mw Prediction:")
print("Mean Absolute Error:", mean_absolute_error(y_capacity_test, capacity_predictions))
print("Mean Squared Error:", mean_squared_error(y_capacity_test, capacity_predictions))
print("R-squared Score:", r2_score(y_capacity_test, capacity_predictions))


  country country_long                      name   gppd_idnr  capacity_mw  \
0     IND        India          ACME Solar Tower  WRI1020239          2.5   
1     IND        India       ADITYA CEMENT WORKS  WRI1019881         98.0   
2     IND        India  AES Saurashtra Windfarms  WRI1026669         39.2   
3     IND        India               AGARTALA GT  IND0000001        135.0   
4     IND        India              AKALTARA TPP  IND0000002       1800.0   

   latitude  longitude primary_fuel other_fuel1 other_fuel2  ...  \
0   28.1839    73.2407        Solar         NaN         NaN  ...   
1   24.7663    74.6090         Coal         NaN         NaN  ...   
2   21.9038    69.3732         Wind         NaN         NaN  ...   
3   23.8712    91.3602          Gas         NaN         NaN  ...   
4   21.9603    82.4091         Coal         Oil         NaN  ...   

   year_of_capacity_data  generation_gwh_2013 generation_gwh_2014  \
0                    NaN                  NaN              

ValueError: could not convert string to float: 'India'

In [2]:
#Project 2

                                    #Temperature Forecast Project using ML

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

url = "https://raw.githubusercontent.com/dsrscientist/Dataset2/main/temperature.csv"
data = pd.read_csv(url)

print(data.head())
print(data.info())
print(data.describe())

data.dropna(inplace=True)

features = ['Present_Tmax', 'Present_Tmin', 'LDAPS_RHmin', 'LDAPS_RHmax', 'LDAPS_Tmax_lapse', 'LDAPS_Tmin_lapse', 
            'LDAPS_WS', 'LDAPS_LH', 'LDAPS_CC1', 'LDAPS_CC2', 'LDAPS_CC3', 'LDAPS_CC4', 'LDAPS_PPT1', 'LDAPS_PPT2', 
            'LDAPS_PPT3', 'LDAPS_PPT4', 'lat', 'lon', 'DEM', 'Slope', 'Solar radiation']
X = data[features]
y_max = data['Next_Tmax']
y_min = data['Next_Tmin']

X_train, X_test, y_max_train, y_max_test = train_test_split(X, y_max, test_size=0.2, random_state=42)
X_train, X_test, y_min_train, y_min_test = train_test_split(X, y_min, test_size=0.2, random_state=42)

rf_regressor_max = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor_max.fit(X_train, y_max_train)
max_temp_predictions = rf_regressor_max.predict(X_test)

print("\nRegression Metrics for Next_Tmax Prediction:")
print("Mean Absolute Error:", mean_absolute_error(y_max_test, max_temp_predictions))
print("Mean Squared Error:", mean_squared_error(y_max_test, max_temp_predictions))
print("R-squared Score:", r2_score(y_max_test, max_temp_predictions))

rf_regressor_min = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor_min.fit(X_train, y_min_train)
min_temp_predictions = rf_regressor_min.predict(X_test)

print("\nRegression Metrics for Next_Tmin Prediction:")
print("Mean Absolute Error:", mean_absolute_error(y_min_test, min_temp_predictions))
print("Mean Squared Error:", mean_squared_error(y_min_test, min_temp_predictions))
print("R-squared Score:", r2_score(y_min_test, min_temp_predictions))


   station        Date  Present_Tmax  Present_Tmin  LDAPS_RHmin  LDAPS_RHmax  \
0      1.0  30-06-2013          28.7          21.4    58.255688    91.116364   
1      2.0  30-06-2013          31.9          21.6    52.263397    90.604721   
2      3.0  30-06-2013          31.6          23.3    48.690479    83.973587   
3      4.0  30-06-2013          32.0          23.4    58.239788    96.483688   
4      5.0  30-06-2013          31.4          21.9    56.174095    90.155128   

   LDAPS_Tmax_lapse  LDAPS_Tmin_lapse  LDAPS_WS    LDAPS_LH  ...  LDAPS_PPT2  \
0         28.074101         23.006936  6.818887   69.451805  ...         0.0   
1         29.850689         24.035009  5.691890   51.937448  ...         0.0   
2         30.091292         24.565633  6.138224   20.573050  ...         0.0   
3         29.704629         23.326177  5.650050   65.727144  ...         0.0   
4         29.113934         23.486480  5.735004  107.965535  ...         0.0   

   LDAPS_PPT3  LDAPS_PPT4      lat    

In [3]:
#Project 3

                                    #Loan Application Status Prediction

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

url = "https://raw.githubusercontent.com/dsrscientist/DSData/master/loan_prediction.csv"
data = pd.read_csv(url)

print(data.head())
print(data.info())
print(data.describe())

data.fillna(method='ffill', inplace=True)

label_encoder = LabelEncoder()
data['Gender'] = label_encoder.fit_transform(data['Gender'])
data['Married'] = label_encoder.fit_transform(data['Married'])
data['Education'] = label_encoder.fit_transform(data['Education'])
data['Self_Employed'] = label_encoder.fit_transform(data['Self_Employed'])
data['Property_Area'] = label_encoder.fit_transform(data['Property_Area'])
data['Loan_Status'] = label_encoder.fit_transform(data['Loan_Status'])

features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 
            'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area']
X = data[features]
y = data['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
loan_status_predictions = rf_classifier.predict(X_test)

print("Classification Report for Loan Status Prediction:")
print(classification_report(y_test, loan_status_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, loan_status_predictions))
print("Accuracy Score:", accuracy_score(y_test, loan_status_predictions))


    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

ValueError: could not convert string to float: '3+'