# Data Cleaning 

#### 1. Import pandas library.

In [None]:
import pandas as pd

#### 2. Import pymysql and sqlalchemy, as you learned in the lesson on importing/exporting data.


In [None]:
import pymysql
from sqlalchemy import create_engine

#### 3. Create a mysql engine to set the connection to the server. 
Check the connection details [here](https://relational.fit.cvut.cz/dataset/Stats).

In [None]:
# Connection URL for the "stats" MySQL database on localhost, via the PyMySQL driver.
connection_url = 'mysql+pymysql://sreelatha:password@localhost/stats'
engine = create_engine(connection_url)

#### 4. Import the users table.

In [None]:
# Load every row and column of the users table into a DataFrame.
users_query = 'SELECT * FROM stats.users'
data = pd.read_sql_query(users_query, con=engine)
print(data)

#### 5. Rename Id column to userId.

In [None]:
from sqlalchemy import text

# Rename users.Id to userId in the MySQL table and in the `data` DataFrame.
# The ALTER only runs while the loaded table still has an Id column, so
# re-running the notebook does not fail on an already-renamed column.
if 'Id' in data.columns:
    # Engine.execute() was removed in SQLAlchemy 2.0; an explicit connection
    # with text() works on both 1.x and 2.x.
    with engine.begin() as connection:
        connection.execute(text("ALTER TABLE users CHANGE Id userId int(11)"))
# rename() ignores missing keys, so this is a no-op when Id is already gone.
data = data.rename(columns={'Id': 'userId'})


#### 6. Import the posts table. 

In [None]:
# Load every row and column of the posts table into a DataFrame.
posts_query = 'SELECT * FROM stats.posts'
posts = pd.read_sql_query(posts_query, con=engine)
print(posts)

#### 7. Rename Id column to postId and OwnerUserId to userId.

In [None]:
from sqlalchemy import text

# Rename posts.Id to postId in the MySQL table. The ALTER only runs while the
# loaded table still has an Id column, so re-running the notebook does not
# fail on an already-renamed column.
if 'Id' in posts.columns:
    # Engine.execute() was removed in SQLAlchemy 2.0; an explicit connection
    # with text() works on both 1.x and 2.x.
    with engine.begin() as connection:
        connection.execute(text("ALTER TABLE posts CHANGE Id postId int(11)"))
# Apply both renames the exercise asks for to the in-memory frame:
# Id -> postId and OwnerUserId -> userId. rename() ignores missing keys.
posts = posts.rename(columns={'Id': 'postId', 'OwnerUserId': 'userId'})

#### 8. Define new dataframes for users and posts with the following selected columns:
**users columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts columns**: postId, Score, userId, ViewCount, CommentCount

In [None]:
# Build the trimmed frames from the tables already loaded into `data` and
# `posts`. rename() ignores keys that are absent, so this works whether or
# not the columns have already been renamed.
user_fields = ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']
usercolumns = data.rename(columns={'Id': 'userId'})[user_fields]
print(usercolumns)

# A post's userId is its owner (OwnerUserId). Joining posts to the comments
# table instead yields the commenters' ids, one row per comment rather than
# one row per post.
post_fields = ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']
postscolumns = posts.rename(columns={'Id': 'postId', 'OwnerUserId': 'userId'})[post_fields]
print(postscolumns)

#### 9. Merge the new dataframes you have created, of users and posts. 
You will need to make an inner [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [None]:
# Inner join on userId: keep only rows whose userId appears in both frames.
mergedcolumn = usercolumns.merge(postscolumns, on='userId', how='inner')
mergedcolumn

#### 10. How many missing values do you have in your merged dataframe? On which columns?

In [None]:
# Count the missing values in each column of the merged frame.
null_cols = mergedcolumn.isna().sum()
print(null_cols)

#### 11. You will need to do something about the missing values. Will you drop them or fill them? Explain.
**Remember** to check the results of your code before going to the next step.

In [None]:
# ViewCount had 82,578 NaN values when this was written. We assume a missing
# count means nobody viewed the post, so the NaNs are replaced with zero.

mergedcolumn['ViewCount'] = mergedcolumn['ViewCount'].fillna(0)

mergedcolumn

#### 12. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [None]:
# Show the dtype of every column to spot the ones that need converting.
mergedcolumn.dtypes


In [None]:
# ViewCount is int(11) in MySQL but shows up as float64 in this DataFrame.
# A view count is a whole number (1.5 or 5.5 views makes no sense), so the
# column is converted from float to integer.

view_counts = mergedcolumn['ViewCount']
mergedcolumn['ViewCount'] = view_counts.astype(int)
mergedcolumn['ViewCount'].dtype