### Creating dummy variables

In [2]:
import pandas as pd

music_df = pd.read_csv('..\\datasets\\Africa - Conflict Data\\ACLED-All-Africa-File_20170101-to-20171104_csv.csv', delimiter=',')
music_df = music_df[['YEAR', 'EVENT_TYPE', 'INTERACTION', 'FATALITIES']]
#print(music_df.head())
#print("Shape of music_df: {}".format(music_df.shape))

# Create music_dummies
EVENT_TYPE_dummies = pd.get_dummies(music_df["EVENT_TYPE"], drop_first=True) 
#COUNTRY_dummies = pd.get_dummies(music_df["COUNTRY"], drop_first=True) 

music_dummies = pd.concat([music_df, EVENT_TYPE_dummies], axis=1)
music_dummies = music_dummies.drop(["EVENT_TYPE"], axis=1)

# Print the new DataFrame's shape
#print("Shape of music_dummies: {}".format(music_dummies.shape))
#print(music_dummies.head())


### Regression with categorical features

In [3]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge
import numpy as np

# Create a KFold object
kf = KFold(n_splits=6, shuffle=True, random_state=5)
# Create X and y
X = music_dummies.drop("FATALITIES", axis=1).values
y = music_dummies["FATALITIES"].values

# Instantiate a ridge model
ridge = Ridge(alpha=0.2)

# Perform cross-validation
scores = cross_val_score(ridge, X, y, cv=kf, scoring="neg_mean_squared_error")

# Calculate RMSE
rmse = np.sqrt(-scores)
print("Average RMSE: {}".format(np.mean(rmse)))
print("Standard Deviation of the target array: {}".format(np.std(y)))

Average RMSE: 7.472086831135653
Standard Deviation of the target array: 8.184679255993412


### Dropping missing data

In [44]:
# Print missing values for each column
print(music_dummies.isna().sum().sort_values())
music_dummies = music_dummies.dropna()
print(music_dummies.isna().sum().sort_values())

YEAR                                          0
INTERACTION                                   0
FATALITIES                                    0
Battle-No change of territory                 0
Battle-Non-state actor overtakes territory    0
Battle-no change of territory                 0
Headquarters or base established              0
Non-violent transfer of territory             0
Remote Violence                               0
Remote violence                               0
Riots/Protests                                0
Strategic development                         0
Violence Against Civilians                    0
Violence against civilians                    0
dtype: int64
YEAR                                          0
INTERACTION                                   0
FATALITIES                                    0
Battle-No change of territory                 0
Battle-Non-state actor overtakes territory    0
Battle-no change of territory                 0
Headquarters or base establ

### Pipeline for song genre prediction: I

In [45]:
# Import modules
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# Instantiate an imputer
imputer = SimpleImputer()

# Instantiate a knn model
knn = KNeighborsClassifier(n_neighbors=3)

# Build steps for the pipeline
steps = [("imputer", imputer), ("knn", knn)]

# Create the pipeline
pipeline = Pipeline(steps)

X = music_dummies.drop("FATALITIES", axis=1).values
y = music_dummies["FATALITIES"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Print the confusion matrix
print(confusion_matrix(y_test, y_pred))

[[1759  103   11 ...    0    0    0]
 [ 235  115    4 ...    0    0    0]
 [ 119   31    2 ...    0    0    0]
 ...
 [   1    0    0 ...    0    0    0]
 [   1    0    0 ...    0    0    0]
 [   1    0    0 ...    0    0    0]]


### Centering and scaling for regression

In [25]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

# Create pipeline steps
steps = [("scaler", StandardScaler()),
         ("lasso", Lasso(alpha=0.5))]

# Instantiate the pipeline
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

# Calculate and print R-squared
print(pipeline.score(X_test, y_test))

0.019496399181650248


### Centering and scaling for classification

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Build the steps
steps = [("scaler", StandardScaler()),
         ("logreg", LogisticRegression())]
pipeline = Pipeline(steps)

# Create the parameter space
#parameters = {"logreg__C": np.linspace(0.001, 1.0, 20), "logreg__max_iter":[1000]}
parameters = {"logreg__C": np.linspace(0.001, 1.0, 4), "logreg__max_iter":[1500]}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=21)

# Instantiate the grid search object
cv = GridSearchCV(pipeline, param_grid=parameters)

# Fit to the training data
cv.fit(X_train, y_train)
print(cv.best_score_, "\n", cv.best_params_)



0.652217495722605 
 {'logreg__C': 0.334, 'logreg__max_iter': 1500}


### Visualizing regression model performance

In [47]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

models = {"LogisticRegression": LogisticRegression(C=0.334, max_iter=1000), "Linear Regression": LinearRegression(), "Ridge": Ridge(alpha=0.1), "Lasso": Lasso(alpha=0.1)}
#models = {"LogisticRegression": LogisticRegression(C=0.334, max_iter=1500)}
results = []

# Loop through the models' values
for model in models.values():
  kf = KFold(n_splits=6, random_state=42, shuffle=True)
  
  # Perform cross-validation
  cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
  
  # Append the results
  results.append(cv_scores)
  
# Create a box plot of the results
plt.boxplot(results, labels=models.keys())
plt.show()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

### Predicting on the test set

In [None]:
# Import mean_squared_error
from sklearn.metrics import mean_squared_error

for name, model in models.items():
  
  # Fit the model to the training data
  model.fit(X_train_scaled, y_train)
  
  # Make predictions on the test set
  y_pred = model.predict(X_test_scaled)
  
  # Calculate the test_rmse
  test_rmse = mean_squared_error(y_test, y_pred, squared=False)
  print("{} Test Set RMSE: {}".format(name, test_rmse))

### Pipeline for predicting song popularity

In [None]:
# Create steps
steps = [("imp_mean", SimpleImputer()), 
         ("scaler", StandardScaler()), 
         ("logreg", LogisticRegression())]

# Set up pipeline
pipeline = Pipeline(steps)
params = {"logreg__solver": ["newton-cg", "saga", "lbfgs"],
         "logreg__C": np.linspace(0.001, 1.0, 10)}

# Create the GridSearchCV object
tuning = GridSearchCV(pipeline, param_grid=params)
tuning.fit(X_train, y_train)
y_pred = tuning.predict(X_test)

# Compute and print performance
print("Tuned Logistic Regression Parameters: {}, Accuracy: {}".format(tuning.best_params_, tuning.score(X_test, y_test)))