In [None]:
### Steps 1, 2, and 4 ### I will import necessary libraries as the steps progress in the code block
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Load the raw campaign data. Malformed rows are skipped (on_bad_lines='skip')
### so the file can still be parsed into a DataFrame.
go_fund_me = pd.read_csv('raw_data.csv', sep = ',', on_bad_lines='skip')

### Target and predictors
### Non-numeric 'raised' entries become NaN (errors='coerce'); those rows are
### removed from model_data by the dropna() at the end of this cell.
go_fund_me['raised'] = pd.to_numeric(go_fund_me['raised'], errors='coerce')

target = 'raised'
predictors = ['category', 'goal', 'country', 'cover_photo','num_photo_main_body' ] ### DO NOT FORGET TO ADD THE PREDICTOR FOR VECTORIZER

### To encode the categorical variables
categorical_col = ['category', 'country','cover_photo']
label_encoders = {}

for col in categorical_col:
  le = LabelEncoder()
  ### Missing values get their own 'Unknown' label instead of being dropped
  go_fund_me[col] = le.fit_transform(go_fund_me[col].fillna('Unknown'))
  ### Keep the fitted encoder so new records can be mapped with the same codes
  label_encoders[col] = le

### Snapshot the modelling columns only AFTER encoding so that model_data holds
### the integer codes; a copy taken before the loop would keep the raw labels.
model_data = go_fund_me[predictors + [target]].copy()

### If any missing values
model_data = model_data.dropna()




In [None]:
# Step 3
##### Correlation Matrix of all the variables within the df
import pandas as pd
import numpy as np

### Keep only the numeric columns; correlation is not defined for the others.
### NOTE: category / country / cover_photo are label-encoded integer codes at this
### point, so their coefficients reflect the code ordering - interpret with care.
numemrical_columns = go_fund_me.select_dtypes(include=np.number).columns

### Pairwise correlation matrix (pandas default method) over those columns
corr_m = go_fund_me.loc[:, numemrical_columns].corr()
print(corr_m)

                     category      goal   country  cover_photo  \
category             1.000000  0.004256  0.031509    -0.008339   
goal                 0.004256  1.000000  0.001110     0.000155   
country              0.031509  0.001110  1.000000    -0.002472   
cover_photo         -0.008339  0.000155 -0.002472     1.000000   
num_photo_main_body -0.096581 -0.001846 -0.009740     0.004434   
raised               0.266786 -0.001641 -0.015571    -0.003117   

                     num_photo_main_body    raised  
category                       -0.096581  0.266786  
goal                           -0.001846 -0.001641  
country                        -0.009740 -0.015571  
cover_photo                     0.004434 -0.003117  
num_photo_main_body             1.000000  0.026297  
raised                          0.026297  1.000000  


In [None]:
##### To add vectorizer description data into predictors list
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vectorizer = TfidfVectorizer(max_features=50, stop_words='english')

### Keep the rows that have every tabular predictor and the target. Dropping on
### ALL columns would also discard rows for gaps in columns the model never uses.
### A missing description is treated as empty text rather than a reason to drop.
model_rows = go_fund_me.dropna(subset=predictors + [target])
descriptions = model_rows['clean_description'].fillna('').astype(str)

### Step 5 Choose the train/test rows BEFORE fitting the vectorizer, so the TF-IDF
### vocabulary and IDF weights are learned from the training descriptions only
### (fitting on every row would leak test-set text into the features).
row_positions = np.arange(len(model_rows))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

vectorizer.fit(descriptions.iloc[train_pos])
tfidf_matrix = vectorizer.transform(descriptions)

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=model_rows.index)

combined_data = pd.concat([model_rows, tfidf_df], axis=1)

### We need to add the features to the existing predictors defined in the initial steps
tfidf_features = tfidf_df.columns.tolist()
updated_predictors = predictors + tfidf_features

x_combined = combined_data[updated_predictors]
y_combined = combined_data[target]

### Positional indexing keeps the split independent of the DataFrame's index labels
x_train_combined, x_test_combined = x_combined.iloc[train_pos], x_combined.iloc[test_pos]
y_train_combined, y_test_combined = y_combined.iloc[train_pos], y_combined.iloc[test_pos]

### Step 6 Regression model
linear_model = LinearRegression()
linear_model.fit(x_train_combined, y_train_combined)

### Step 7 Evaluate the model on the held-out test data
y_pred_combined = linear_model.predict(x_test_combined)
mse = mean_squared_error(y_test_combined, y_pred_combined)
r2 = r2_score(y_test_combined, y_pred_combined)

### A bare `mse, r2` expression is only displayed when it is the last line of a
### cell, so report the metrics explicitly.
print(f"Test MSE: {mse:.2f} | Test R^2: {r2:.4f}")

### Step 8 Result Prediction

### A single hypothetical campaign to score with the fitted model
new_data = pd.DataFrame({
    'category': ['Memorial'],
    'goal': [7000],
    'country': ['US'],
    'cover_photo': [True],
    'num_photo_main_body': [3],
    'clean_description': ['My father has passed.'] ### Change this text to see how the description shifts the predicted amount
})

### Encode the categorical inputs: cover_photo is cast to an integer, while
### category and country reuse the encoders stored in label_encoders
encoded_columns = {
    name: label_encoders[name].transform(new_data[name])
    for name in ('category', 'country')
}
new_data = new_data.assign(
    cover_photo=new_data['cover_photo'].astype(int),
    **encoded_columns,
)

### Generate the TF-IDF features for the new description with the already-fitted vectorizer
description_terms = vectorizer.get_feature_names_out()
tfidf_new = vectorizer.transform(new_data['clean_description']).toarray()
tfidf_new_df = pd.DataFrame(tfidf_new, columns=description_terms)

### Join the tabular inputs with the text features, then align the columns with
### the original training data (any column missing here is filled with 0)
tabular_part = new_data.drop(columns=['clean_description'])
new_data_combined = pd.concat([tabular_part, tfidf_new_df], axis=1).reindex(
    columns=x_combined.columns, fill_value=0
)

predicted_raised = linear_model.predict(new_data_combined)

print(f"Predicted Raised Amount: ${predicted_raised[0]:.2f}")





Predicted Raised Amount: $8299.68
