# Data Cleaning 

#### 1. Import pandas library.

In [None]:
import numpy as np
import pandas as pd

#### 2. Import pymysql and sqlalchemy, as you learned in the lesson on importing/exporting data.

In [None]:
import pymysql
from sqlalchemy import create_engine

#### 3. Create a mysql engine to set the connection to the server. 
Check the connection details [here](https://relational.fit.cvut.cz/dataset/Stats).

In [None]:
# SQLAlchemy URL: PyMySQL driver, guest credentials, host relational.fit.cvut.cz,
# port 3306, database "stats".
stats_db_url = 'mysql+pymysql://guest:relational@relational.fit.cvut.cz:3306/stats'
engine = create_engine(stats_db_url)

#### 4. Import the users table.

In [None]:
# Load every row and column of stats.users into a DataFrame.
users = pd.read_sql_query(sql='SELECT * FROM stats.users', con=engine)

In [None]:
# Preview the first five rows of the users table.
users.head(n=5)

#### 5. Rename Id column to userId.

In [None]:
# Give the key column a descriptive name: Id -> userId.
users = users.rename({'Id': 'userId'}, axis='columns')

In [None]:
# Preview the first five rows to confirm the rename.
users.head(n=5)

#### 6. Import the posts table. 

In [None]:
# Load every row and column of stats.posts into a DataFrame.
posts = pd.read_sql_query(sql='SELECT * FROM stats.posts', con=engine)

In [None]:
# Preview the first five rows of the posts table.
posts.head(n=5)

#### 7. Rename Id column to postId and OwnerUserId to userId.

In [None]:
# Rename the keys: Id -> postId, OwnerUserId -> userId.
posts = posts.rename({'Id': 'postId', 'OwnerUserId': 'userId'}, axis='columns')

In [None]:
# Preview the first five rows to confirm the rename.
posts.head(n=5)

#### 8. Define new dataframes for users and posts with the following selected columns:
**users columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts columns**: postId, Score, userId, ViewCount, CommentCount

In [None]:
# Keep only the requested user columns; .copy() makes users_2 an independent frame.
users_2 = users.loc[:, ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']].copy()
users_2.head(n=5)

In [None]:
# Keep only the requested post columns; .copy() makes posts_2 an independent frame.
posts_2 = posts.loc[:, ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']].copy()
posts_2.head(n=5)

#### 9. Merge the new dataframes you have created, of users and posts. 
You will need to perform an inner [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of the posts and users dataframes.

In [None]:
# Inner join of users and posts. The key is named explicitly so the merge does
# not depend on whichever column names the two frames happen to share.
users_posts_merge = users_2.merge(posts_2, how='inner', on='userId')
users_posts_merge.head()

#### 10. How many missing values do you have in your merged dataframe? On which columns?

In [None]:
# Summary of the merged frame: column dtypes and non-null counts per column.
users_posts_merge.info()

In [None]:
# Count the missing values in each column.
# Observation recorded by the author: 48396 missing values in the ViewCount column.
users_posts_merge.isna().sum()

In [None]:
# Boolean mask of the merged frame: True wherever a value is missing.
users_posts_merge.isna()

#### 11. You will need to do something about the missing values. Will you drop them or fill them? Explain. 
**Remember** to check the results of your code before going to the next step.

In [None]:
# More than 50% of the ViewCount values are missing, so dropping those rows
# would discard more than half of the dataset. Filling the gaps with 0 keeps
# every row instead.
# Only ViewCount is filled: a blanket fillna(0) would also silently zero out
# any missing value that might appear in another column.
users_posts_merge = users_posts_merge.fillna({'ViewCount': 0})
users_posts_merge


#### 12. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [None]:
# ViewCount is the column that should be changed: cast it to integers.
view_count_as_int = users_posts_merge["ViewCount"].astype(int)
users_posts_merge = users_posts_merge.assign(ViewCount=view_count_as_int)

In [None]:
# Display the merged frame to inspect the result of the dtype change.
users_posts_merge

In [None]:
# Re-check the column dtypes and non-null counts after the conversion.
users_posts_merge.info()