## Setup

In [None]:
!pip install kaggle
!pip install opendatasets
!pip install pandas

In [None]:
import opendatasets as od

# Fetch the StackSample dataset from Kaggle. Later cells read the CSV files
# from the local "stacksample/" directory.
dataset_url = "https://www.kaggle.com/datasets/stackoverflow/stacksample/"
od.download(dataset_url)


In [2]:
import pandas as pd
from io import StringIO

In [3]:
# Load the questions table; the raw file is decoded as ISO-8859-1 (Latin-1).
questions_csv_path = 'stacksample/Questions.csv'
questions_df = pd.read_csv(questions_csv_path, encoding="ISO-8859-1")

# Preview the first rows.
questions_df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [8]:
# Load the answers table; the raw file is decoded as ISO-8859-1 (Latin-1).
answers_csv_path = 'stacksample/Answers.csv'
answers_df = pd.read_csv(answers_csv_path, encoding="ISO-8859-1")

# Preview the first rows.
answers_df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,26.0,2008-08-01T16:09:47Z,80,12,<p>I wound up using this. It is a kind of a ha...
2,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...
3,269,91.0,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur..."
4,307,49.0,2008-08-02T01:49:46Z,260,28,"<p><a href=""http://www.codeproject.com/Article..."


In [6]:
# Removing the Tags.csv file because we don't need it
import os

tags_csv_path = "stacksample/Tags.csv"

# Only delete when the file is present, so re-running this cell (or running it
# after the file was already removed) does not raise FileNotFoundError.
if os.path.exists(tags_csv_path):
    os.remove(tags_csv_path)

## Data preprocessing

In [22]:
## Sum the answer scores per question (answers reference their question via ParentId)
## and rename the columns so the result can be joined to the questions on 'Id'.
total_answers_score = (
    answers_df
    .groupby('ParentId')['Score']
    .sum()
    .reset_index()
    .rename(columns={'Score': 'TotalAnswersScore', 'ParentId': 'Id'})
)

# Columns that are dropped once the 'Closed' flag has been derived.
columns_to_remove = ['OwnerUserId', 'CreationDate', 'ClosedDate']

# Left join keeps every question; questions with no matching answers get a
# TotalAnswersScore of 0. 'Closed' is True where ClosedDate is present.
merged_df = (
    questions_df
    .merge(total_answers_score, on='Id', how='left')
    .assign(
        TotalAnswersScore=lambda frame: frame['TotalAnswersScore'].fillna(0),
        Closed=lambda frame: frame['ClosedDate'].notnull(),
    )
    .drop(columns=columns_to_remove)
)

# Saving the merged data to a new CSV file (without the index column)
output_file_path = './Questions_with_TotalAnswersScore.csv'
merged_df.to_csv(output_file_path, index=False)

In [23]:
# Reload the preprocessed questions from disk. DataFrame.to_csv writes UTF-8
# unless told otherwise, so the file has to be decoded as UTF-8; decoding it as
# ISO-8859-1 would garble every non-ASCII character in the text columns.
modified_qustions_df = pd.read_csv(output_file_path, encoding="utf-8")
modified_qustions_df.head()

Unnamed: 0,Id,Score,Title,Body,TotalAnswersScore,Closed
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,19.0,False
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,34.0,True
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,9.0,False
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,54.0,False
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,63.0,False


In [None]:
import gc

# Drop the references first: a collection pass that runs while these names are
# still bound cannot release the frames they point to.
del questions_df
del answers_df
del merged_df

# Collect after the names are gone. The trailing semicolon keeps the
# collector's return value out of the cell output.
gc.collect();