In [2]:
import numpy as np
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import matplotlib.dates as mdates
from datetime import datetime
from matplotlib import cm

pd.set_option("display.notebook_repr_html", False)  # disable "rich" (HTML) output

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
# later releases dropped the old names, so prefer the new name and fall back
# to the legacy one only when the installed Matplotlib does not know it.
plt.style.use(
    "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
)  # plot style

sns.set_palette([  # the "R4" palette
    "#000000", "#DF536B", "#61D04F", "#2297E6",
    "#28E2E5", "#CD0BBC", "#F5C710", "#999999"
])

# Global figure defaults applied on top of the style selected above.
plt.rcParams.update({
    "font.size":         11,
    "font.family":       "sans-serif",
    "font.sans-serif":   ["Alegreya Sans", "Alegreya"],
    "figure.autolayout": True,
    "figure.dpi":        300,
    "figure.figsize":    (6, 3.5),  # default is [8.0, 5.5],
})

In [3]:
import os, os.path
import sqlite3
# Path of the SQLite database file that each task cell deletes and recreates
# ("baza" is Polish for "database"); relative to the working directory.
baza = "main.db"

In [4]:
# Read the four gzip-compressed CSV dumps (in this order) into DataFrames
# named after the tables they hold.
Posts, Votes, Badges, Users = (
    pd.read_csv(f"travel_stackexchange_com/{table}.csv.gz", compression="gzip")
    for table in ("Posts", "Votes", "Badges", "Users")
)

## Zadanie 1

In [5]:
# Task 1 (SQL reference): the 10 most frequently awarded badge names, with
# the number of awards and the smallest Class value seen for each name.

# Start from a fresh database file on every run.
if os.path.isfile(baza):
    os.remove(baza)
conn = sqlite3.connect(baza)

# try/finally guarantees the connection is closed even when the export or
# the query raises, so a failed run does not leave an open handle on the file.
try:
    # Only Badges is exported: the query below reads no other table.
    Badges.to_sql("Badges", conn)

    resultSQL = pd.read_sql_query(
"""SELECT Name,
  COUNT(*) AS Number,
  MIN(Class) AS BestClass
  FROM Badges
  GROUP BY Name
  ORDER BY Number DESC
  LIMIT 10
""", conn)
finally:
    conn.close()

In [6]:
# Task 1 (pandas): per badge name, count the rows and take the minimum
# Class, then keep the 10 names with the highest count.
res = (
    Badges[["Name", "Class"]]
    .groupby("Name")
    .agg(Number=("Name", "count"), BestClass=("Class", "min"))
    .sort_values(by="Number", ascending=False)
    .head(10)
    .reset_index()  # turn the Name index back into an ordinary column
)
resultPd = res

In [7]:
# Compare the SQL reference result with the pandas result; as the last
# expression of the cell, the boolean returned by equals() is displayed.
resultSQL.equals(resultPd)

True

## Zadanie 2

In [33]:
# Task 2 (SQL reference): the 10 locations with the most Users-Posts join
# rows, i.e. posts whose OwnerUserId matches a user Id, skipping rows whose
# Location is the empty string.

# Start from a fresh database file on every run.
if os.path.isfile(baza):
    os.remove(baza)
conn = sqlite3.connect(baza)

# try/finally guarantees the connection is closed even when the export or
# the query raises, so a failed run does not leave an open handle on the file.
try:
    Posts.to_sql("Posts", conn)
    Users.to_sql("Users", conn)

    resultSQL = pd.read_sql_query(
"""SELECT Location, COUNT(*) AS Count
  FROM (
        SELECT Posts.OwnerUserId, Users.Id, Users.Location
        FROM Users
        JOIN Posts ON Users.Id = Posts.OwnerUserId
        )
  WHERE Location NOT IN ('')
  GROUP BY Location
  ORDER BY Count DESC
  LIMIT 10
""", conn)
finally:
    conn.close()

In [37]:
# Task 2 (pandas): inner-join users to the posts they own, drop rows with a
# missing Location, count the joined rows per Location and keep the top 10.
res = (
    Users[["Id", "Location"]]
    .merge(
        Posts[["OwnerUserId"]],
        how="inner",
        left_on="Id",
        right_on="OwnerUserId",
    )
    .loc[:, ["Id", "Location"]]
    .dropna(subset=["Location"])
    .groupby("Location")
    .agg(Count=("Id", "count"))
    .sort_values(by="Count", ascending=False)
    .head(10)
    .reset_index()  # turn the Location index back into an ordinary column
)
resultPd = res

In [36]:
# Compare the SQL reference result with the pandas result; as the last
# expression of the cell, the boolean returned by equals() is displayed.
resultSQL.equals(resultPd)

True

## Zadanie 3

In [None]:
# Task 3 (SQL reference): for each post owner, the average AnswersCount over
# their posts that have at least one child post with PostTypeId = 2; the 10
# owners with the highest average are returned.
#   AnsCount - number of PostTypeId = 2 posts per ParentId
#   PostAuth - AnsCount joined back to Posts to obtain each parent's owner
# NOTE(review): the final join matches Users.AccountId to OwnerUserId, while
# the Task 2 query joins on Users.Id - confirm this key is intended.

# Start from a fresh database file on every run.
if os.path.isfile(baza):
    os.remove(baza)
conn = sqlite3.connect(baza)

# try/finally guarantees the connection is closed even when the export or
# the query raises, so a failed run does not leave an open handle on the file.
try:
    # Only the tables the query reads are exported.
    Posts.to_sql("Posts", conn)
    Users.to_sql("Users", conn)

    resultSQL = pd.read_sql_query(
"""SELECT
        Users.AccountId,
        Users.DisplayName,
        Users.Location,
        AVG(PostAuth.AnswersCount) as AverageAnswersCount
    FROM
        (
        SELECT
        AnsCount.AnswersCount,
        Posts.Id,
        Posts.OwnerUserId
        FROM 
            (
            SELECT Posts.ParentId, COUNT(*) AS AnswersCount
            FROM Posts
            WHERE Posts.PostTypeId = 2
            GROUP BY Posts.ParentId
            ) AS AnsCount
        JOIN Posts ON Posts.Id = AnsCount.ParentId
        ) AS PostAuth
    JOIN Users ON Users.AccountId=PostAuth.OwnerUserId
    GROUP BY OwnerUserId
    ORDER BY AverageAnswersCount DESC
    LIMIT 10
""", conn)
finally:
    conn.close()

## Zadanie 4

In [None]:
# Task 4 (SQL reference): for every year, the largest per-post count of
# votes with VoteTypeId = 2 among posts with PostTypeId = 1, together with a
# post title, ordered by year.
#   UpVotesPerYear - number of VoteTypeId = 2 votes per (PostId, Year)
# NOTE(review): Title is a bare column next to MAX(...) under GROUP BY Year;
# this presumably relies on SQLite taking bare columns from the row that
# holds the maximum - confirm against the SQLite documentation.

# Start from a fresh database file on every run.
if os.path.isfile(baza):
    os.remove(baza)
conn = sqlite3.connect(baza)

# try/finally guarantees the connection is closed even when the export or
# the query raises, so a failed run does not leave an open handle on the file.
try:
    # Only the tables the query reads are exported.
    Posts.to_sql("Posts", conn)
    Votes.to_sql("Votes", conn)

    resultSQL = pd.read_sql_query(
"""SELECT
        Posts.Title,
        UpVotesPerYear.Year,
        MAX(UpVotesPerYear.Count) AS Count
    FROM 
        (
        SELECT
            PostId,
            COUNT(*) AS Count,
            STRFTIME('%Y', Votes.CreationDate) AS Year
        FROM Votes
        WHERE VoteTypeId=2
        GROUP BY PostId, Year
        ) AS UpVotesPerYear
    JOIN Posts ON Posts.Id=UpVotesPerYear.PostId
    WHERE Posts.PostTypeId=1
    GROUP BY Year
    ORDER BY Year ASC
""", conn)
finally:
    conn.close()

## Zadanie 5

In [None]:
# Task 5 (SQL reference): the 10 posts with PostTypeId = 1 that have the
# most "old" votes and no "new" votes at all.
#   VotesByAge  - votes with VoteTypeId in (1, 2, 5) counted per post and
#                 bucket: 'new' for creation years 2020/2021, 'old' otherwise
#   VotesByAge2 - one row per post with NewVotes / OldVotes / Votes, kept
#                 only when NewVotes = 0

# Start from a fresh database file on every run.
if os.path.isfile(baza):
    os.remove(baza)
conn = sqlite3.connect(baza)

# try/finally guarantees the connection is closed even when the export or
# the query raises, so a failed run does not leave an open handle on the file.
try:
    # Only the tables the query reads are exported.
    Posts.to_sql("Posts", conn)
    Votes.to_sql("Votes", conn)

    resultSQL = pd.read_sql_query(
"""SELECT
    Posts.Title,
    VotesByAge2.OldVotes
    FROM Posts
    JOIN (
        SELECT
            PostId,
            MAX(CASE WHEN VoteDate = 'new' THEN Total ELSE 0 END) NewVotes,
            MAX(CASE WHEN VoteDate = 'old' THEN Total ELSE 0 END) OldVotes,
            SUM(Total) AS Votes
        FROM (
            SELECT
              PostId,
              CASE STRFTIME('%Y', CreationDate)
                WHEN '2021' THEN 'new'
                WHEN '2020' THEN 'new'
                ELSE 'old'
              END VoteDate,
              COUNT(*) AS Total
            FROM Votes
            WHERE VoteTypeId IN (1, 2, 5)
            GROUP BY PostId, VoteDate
            ) AS VotesByAge
        GROUP BY VotesByAge.PostId
        HAVING NewVotes=0
        ) AS VotesByAge2 ON VotesByAge2.PostId=Posts.ID
    WHERE Posts.PostTypeId=1
    ORDER BY VotesByAge2.OldVotes DESC
    LIMIT 10
""", conn)
finally:
    conn.close()