In [7]:
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [8]:
#linear regression

# Load data from the CSV file
data_frame = pd.read_csv('71-80.csv')

df_linear = data_frame.copy()

citation_data = df_linear[['cit_2017', 'cit_2018', 'cit_2019', 'cit_2020', 'cit_2021']].copy()
label_linear = df_linear['cit_2022'].copy()

# Prepare the data for linear regression and split data into Train-Test (80-20)
train_features_l, test_features_l, train_label_l, test_label_l = train_test_split(citation_data, label_linear, test_size=0.2, random_state=43)

scaler_l = MinMaxScaler()
train_features_scaled_l = scaler_l.fit_transform(train_features_l)  
test_features_scaled_l = scaler_l.transform(test_features_l) 

# Linear Regression
LinearRModel = LinearRegression()
LinearRModel.fit(train_features_scaled_l, train_label_l)
prediction_l = LinearRModel.predict(test_features_scaled_l)

mae_linear = mean_absolute_error(test_label_l, prediction_l)
print(f'Linear Regression MAE: {mae_linear:.2f}')

mse_lin = mean_squared_error(test_label_l, prediction_l)
print(f'Linear Regression MSE: {mse_lin:.2f}')

predicted_results_linear = pd.DataFrame({
    'cit_2022': test_label_l,
    'Predicted_2022': prediction_l
})
print(predicted_results_linear)
# Saving the predictions to a CSV file
predicted_results_linear.to_csv('predictions_linear.csv', index=False)


Linear Regression MAE: 34.12
Linear Regression MSE: 5646.22
    cit_2022  Predicted_2022
20         6       22.135792
2        613      574.115647
15       156      119.164762
22        43       36.164586
57       269      283.754796
91       121       97.134342
69       174      176.162231
55       164      164.325709
11      1012      991.321603
79      2800     2485.015071
9         41       47.359354
38       103      178.017922
85        33       48.405717
0         65       70.300274
89       289      290.839358
13        27       31.030743
5         42       41.360167
1         47       74.791888
95       333      375.192055
83        38       66.412398


In [9]:
#logistic regression

# Load data from the CSV file
data_frame = pd.read_csv('71-80.csv')

df_logistic = data_frame.copy()

# Feature Engineering: You can experiment with additional features or transformations
# For example, you might consider adding interactions between features or creating new features

target_labels = []
for i in range(len(data_frame)):
    citation_ratio = df_logistic['cit_2022'][i] / df_logistic['cit_2021'][i]
    if citation_ratio > 1.15:
        target_labels.append("High")
    elif citation_ratio < 1.05:
        target_labels.append("Low")
    else:
        target_labels.append("Medium")

df_logistic['Label'] = target_labels

citation_data = df_logistic[['cit_2017', 'cit_2018', 'cit_2019', 'cit_2020', 'cit_2021', 'cit_2022']].copy()
label_logistic = df_logistic['Label'].copy()

# Prepare the data for logistic regression and split data into Train-Test (80-20)
train_features_lo, test_features_lo, train_label_lo, test_label_lo = train_test_split(citation_data, label_logistic, test_size=0.2, random_state=43)

scaler_lo = MinMaxScaler()
train_features_scaled_lo = scaler_lo.fit_transform(train_features_lo)
test_features_scaled_lo = scaler_lo.transform(test_features_lo)

# Hyperparameter Tuning: Tune the hyperparameters of logistic regression
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(max_iter=100), param_grid, cv=5)
grid_search.fit(train_features_scaled_lo, train_label_lo)

best_logistic_model = grid_search.best_estimator_

# Logistic Regression with tuned hyperparameters
best_logistic_model.fit(train_features_scaled_lo, train_label_lo)
prediction_lo = best_logistic_model.predict(test_features_scaled_lo)
accuracy = accuracy_score(test_label_lo, prediction_lo)
print(f'Logistic Regression Accuracy: {accuracy * 100:.2f}%')

# Print the results
predicted_results_logistic = pd.DataFrame({
    'Actual_Label': test_label_lo,
    'Predicted_Label': prediction_lo
})

print(predicted_results_logistic)

# Saving the predictions to a CSV file
predicted_results_logistic.to_csv('predictions_logistic.csv', index=False)


Logistic Regression Accuracy: 80.00%
   Actual_Label Predicted_Label
20          Low             Low
2           Low             Low
15         High            High
22         High             Low
57          Low             Low
91         High            High
69         High             Low
55         High             Low
11          Low             Low
79         High            High
9           Low             Low
38          Low             Low
85          Low             Low
0           Low             Low
89          Low             Low
13         High             Low
5          High            High
1           Low             Low
95          Low             Low
83          Low             Low
