#### 1. Import pandas library

In [9]:
import pandas as pd

#### 2. Import users table:

In [10]:
# Read the users table from the CSV file in the working directory.
users = pd.read_csv(filepath_or_buffer='users_table.csv')

#### 3. Rename Id column to userId

In [15]:
# Give the key column the same name the posts table will use for it.
# Re-running this cell is harmless: once 'Id' is gone the rename is a no-op.
users = users.rename({'Id': 'userId'}, axis='columns')

# Preview the first 5 rows to confirm the new column name.
print(users.head(5))

   userId  Reputation         CreationDate   DisplayName       LastAccessDate  \
0      -1           1  2010-07-19 06:55:26     Community  2010-07-19 06:55:26   
1       2         101  2010-07-19 14:01:36  Geoff Dalgas  2013-11-12 22:07:23   
2       3         101  2010-07-19 15:34:50  Jarrod Dixon  2014-08-08 06:42:58   
3       4         101  2010-07-19 19:03:27        Emmett  2014-01-02 09:31:02   
4       5        6792  2010-07-19 19:03:57         Shane  2014-08-13 00:23:47   

                       WebsiteUrl            Location  \
0  http://meta.stackexchange.com/  on the server farm   
1        http://stackoverflow.com       Corvallis, OR   
2        http://stackoverflow.com        New York, NY   
3    http://minesweeperonline.com   San Francisco, CA   
4         http://www.statalgo.com        New York, NY   

                                             AboutMe  Views  UpVotes  \
0  <p>Hi, I'm not really a person.</p>\r\n\r\n<p>...      0     5007   
1  <p>Developer on the Sta

#### 4. Import posts table:

In [16]:
# Read the posts table from the CSV file in the working directory.
posts = pd.read_csv(filepath_or_buffer='posts_table.csv')

#### 5. Rename Id column to postId and OwnerUserId to userId

In [17]:
# 'Id' becomes 'postId', and 'OwnerUserId' becomes 'userId' so that it shares
# its name with the key column of the users table.
posts = posts.rename({'Id': 'postId', 'OwnerUserId': 'userId'}, axis='columns')

# Preview the first 5 rows to check the column names.
print(posts.head(5))

   postId  PostTypeId  AcceptedAnswerId          CreaionDate  Score  \
0       1           1              15.0  2010-07-19 19:12:12     23   
1       2           1              59.0  2010-07-19 19:12:57     22   
2       3           1               5.0  2010-07-19 19:13:28     54   
3       4           1             135.0  2010-07-19 19:13:31     13   
4       5           2               NaN  2010-07-19 19:14:43     81   

   ViewCount                                               Body  userId  \
0     1278.0  <p>How should I elicit prior distributions fro...     8.0   
1     8198.0  <p>In many different statistical methods there...    24.0   
2     3613.0  <p>What are some valuable Statistical Analysis...    18.0   
3     5224.0  <p>I have two groups of data.  Each with a dif...    23.0   
4        NaN  <p>The R-project</p>\n\n<p><a href="http://www...    23.0   

       LasActivityDate                                              Title  \
0  2010-09-15 21:08:26                      E

#### 6. Define new dataframes for users and posts with the following selected columns:
- **users columns**: userId, Reputation, Views, UpVotes, DownVotes
- **posts columns**: postId, Score, userId, ViewCount, CommentCount

In [19]:
# Users: keep only the key plus the reputation / activity columns.
users_selected = users[
    ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']
]
# Preview the reduced users dataframe.
print(users_selected.head(5))

# Posts: keep only the keys plus the score / engagement columns.
posts_selected = posts[
    ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']
]
# Preview the reduced posts dataframe.
print(posts_selected.head(5))

   userId  Reputation  Views  UpVotes  DownVotes
0      -1           1      0     5007       1920
1       2         101     25        3          0
2       3         101     22       19          0
3       4         101     11        0          0
4       5        6792   1145      662          5
   postId  Score  userId  ViewCount  CommentCount
0       1     23     8.0     1278.0             1
1       2     22    24.0     8198.0             1
2       3     54    18.0     3613.0             4
3       4     13    23.0     5224.0             2
4       5     81    23.0        NaN             3


#### 7. Merge both dataframes, users and posts. 
You will need to [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) the posts and users dataframes.

In [21]:
# Inner join on 'userId' with posts on the left: only posts whose userId also
# appears in the users dataframe are kept.
merged_df = pd.merge(
    left=posts_selected,
    right=users_selected,
    how='inner',
    on='userId',
)

print(merged_df.head(5))

   postId  Score  userId  ViewCount  CommentCount  Reputation  Views  UpVotes  \
0       1     23     8.0     1278.0             1        6764   1089      604   
1       2     22    24.0     8198.0             1         344     48       36   
2       3     54    18.0     3613.0             4         128      8       16   
3       4     13    23.0     5224.0             2         308     52       34   
4       5     81    23.0        NaN             3         308     52       34   

   DownVotes  
0         25  
1          1  
2          0  
3          1  
4          1  


#### 8. How many missing values do you have in your merged dataframe? On which columns?

In [22]:
# Number of missing entries in each column of the merged dataframe.
missing_values = merged_df.isna().sum()

# Report only the columns that have at least one missing entry.
print(missing_values.loc[missing_values.gt(0)])

ViewCount    23572
dtype: int64


#### 9. You will need to do something about the missing values. Will you drop them or fill them? Explain.
**Remember** to check the results of your code before moving on to the next step.

In [27]:
# Why fill instead of drop: the inner merge did not introduce these gaps --
# ViewCount was already missing for some rows of the posts table. They are
# filled with 0 on the assumption that a post with no recorded view count had
# zero views or incomplete data.
# NOTE(review): the one NaN row in the posts preview has PostTypeId 2 -- worth
# confirming whether the gaps are tied to a post type before relying on the 0s.
merged_df['ViewCount'] = merged_df['ViewCount'].fillna(value=0)

# Re-count missing values per column to confirm none remain after the fill.
print(merged_df.isna().sum())

postId          0
Score           0
userId          0
ViewCount       0
CommentCount    0
Reputation      0
Views           0
UpVotes         0
DownVotes       0
dtype: int64


#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [28]:
# Inspect the dtype of every column before any conversion.
dtypes_before = merged_df.dtypes
print(dtypes_before)


postId            int64
Score             int64
userId          float64
ViewCount       float64
CommentCount      int64
Reputation        int64
Views             int64
UpVotes           int64
DownVotes         int64
dtype: object


In [29]:
# Cast userId and ViewCount from float64 to integers: one is an identifier and
# the other a count, so neither should carry a fractional part.
# An explicit 'int64' is used instead of the builtin `int` because `int` maps
# to the platform's default integer, which is 32-bit on Windows with
# NumPy < 2.0; 'int64' matches the other integer columns on every platform.
# NOTE: the cast raises if either column still contains NaN, so the
# missing-value fill must have been run first.
merged_df = merged_df.astype({'userId': 'int64', 'ViewCount': 'int64'})

In [30]:
# Inspect the dtypes again to confirm the conversion took effect.
dtypes_after = merged_df.dtypes
print(dtypes_after)

postId          int64
Score           int64
userId          int64
ViewCount       int64
CommentCount    int64
Reputation      int64
Views           int64
UpVotes         int64
DownVotes       int64
dtype: object
