In [6]:
import numpy as np
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import matplotlib.dates as mdates
from datetime import datetime
from matplotlib import cm

# Notebook-wide display configuration: plain-text DataFrame output and a
# shared look for every figure produced further down.
pd.set_option("display.notebook_repr_html", False)  # disable "rich" output

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases no longer accept the old "seaborn" name, so pick the new
# name when this Matplotlib offers it and fall back to the legacy one otherwise.
plt.style.use(
    "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
)  # plot style

sns.set_palette([  # the "R4" palette
    "#000000", "#DF536B", "#61D04F", "#2297E6",
    "#28E2E5", "#CD0BBC", "#F5C710", "#999999"
])

plt.rcParams.update({
    "font.size":         11,
    "font.family":       "sans-serif",
    "font.sans-serif":   ["Alegreya Sans", "Alegreya"],
    "figure.autolayout": True,
    "figure.dpi":        300,
    "figure.figsize":    (6, 3.5),  # default is [8.0, 5.5],
})

In [None]:
import os, os.path
import sqlite3
# Path of the scratch SQLite database file, relative to the working directory.
baza = 'main.db'

In [11]:
# Read the gzip-compressed CSV exports into one DataFrame per table; the
# tables are read in the same order as the names on the left-hand side.
Posts, Votes, Badges, Users = (
    pd.read_csv(f"travel_stackexchange_com/{table}.csv.gz", compression='gzip')
    for table in ("Posts", "Votes", "Badges", "Users")
)

## Zadanie 1

In [51]:
# Task 1: the ten most frequently awarded badge names, with the number of
# awards and the lowest Class value seen for each name.

# Start from an empty database file so that to_sql never collides with a
# table left over from a previous run of any task cell.
if os.path.isfile(baza):
    os.remove(baza)
conn = sqlite3.connect(baza)

try:
    # Only Badges is written: it is the only table this query reads.
    Badges.to_sql("Badges", conn)

    print(pd.read_sql_query(
"""SELECT Name,
  COUNT(*) AS Number,
  MIN(Class) AS BestClass
  FROM Badges
  GROUP BY Name
  ORDER BY Number DESC
  LIMIT 10
""", conn))
finally:
    # Close even when loading or querying fails; an open handle on the file
    # can make the os.remove() at the top of the next cell fail.
    conn.close()

               Name  Number  BestClass
0    Autobiographer   24564          3
1           Student   23930          3
2         Supporter   17826          3
3  Popular Question   14840          3
4       Nice Answer   12406          3
5          Yearling   11281          2
6           Teacher    9746          3
7            Editor    9115          3
8  Notable Question    8613          2
9           Scholar    7224          3


## Zadanie 2

In [53]:
# Task 2: the ten non-empty user locations with the most posts, counting one
# row per (user, post) pair produced by joining Users to Posts.

# Start from an empty database file so that to_sql never collides with a
# table left over from a previous run of any task cell.
if os.path.isfile(baza):
    os.remove(baza)
conn = sqlite3.connect(baza)

try:
    # Load just the tables the query references.
    Posts.to_sql("Posts", conn)
    Users.to_sql("Users", conn)

    print(pd.read_sql_query(
"""SELECT Location, COUNT(*) AS Count
  FROM (
        SELECT Posts.OwnerUserId, Users.Id, Users.Location
        FROM Users
        JOIN Posts ON Users.Id = Posts.OwnerUserId
        )
  WHERE Location NOT IN ('')
  GROUP BY Location
  ORDER BY Count DESC
  LIMIT 10
""", conn))
finally:
    # Close even when loading or querying fails; an open handle on the file
    # can make the os.remove() at the top of the next cell fail.
    conn.close()

                        Location  Count
0      Christchurch, New Zealand   2765
1                   New York, NY   1788
2         London, United Kingdom   1708
3                             UK   1590
4  Sunshine Coast QLD, Australia   1550
5                      Australia   1183
6              Vancouver, Canada    967
7                    Netherlands    935
8             on the server farm    924
9                   Pennsylvania    921


## Zadanie 3

In [54]:
# Task 3: for each post owner, the average number of answers received by
# their posts that have at least one answer; the ten highest averages are shown.
#
# NOTE(review): the outer join matches Posts.OwnerUserId against
# Users.AccountId, whereas the Task 2 query joins the same column against
# Users.Id. The SQL is kept exactly as given -- confirm which key is intended.

# Start from an empty database file so that to_sql never collides with a
# table left over from a previous run of any task cell.
if os.path.isfile(baza):
    os.remove(baza)
conn = sqlite3.connect(baza)

try:
    # Load just the tables the query references.
    Posts.to_sql("Posts", conn)
    Users.to_sql("Users", conn)

    print(pd.read_sql_query(
"""SELECT
        Users.AccountId,
        Users.DisplayName,
        Users.Location,
        AVG(PostAuth.AnswersCount) as AverageAnswersCount
    FROM
        (
        SELECT
        AnsCount.AnswersCount,
        Posts.Id,
        Posts.OwnerUserId
        FROM 
            (
            SELECT Posts.ParentId, COUNT(*) AS AnswersCount
            FROM Posts
            WHERE Posts.PostTypeId = 2
            GROUP BY Posts.ParentId
            ) AS AnsCount
        JOIN Posts ON Posts.Id = AnsCount.ParentId
        ) AS PostAuth
    JOIN Users ON Users.AccountId=PostAuth.OwnerUserId
    GROUP BY OwnerUserId
    ORDER BY AverageAnswersCount DESC
    LIMIT 10
""", conn))
finally:
    # Close even when loading or querying fails; an open handle on the file
    # can make the os.remove() at the top of the next cell fail.
    conn.close()

   AccountId      DisplayName           Location  AverageAnswersCount
0    40811.0           vocaro       San Jose, CA                 11.0
1      280.0            csmba  San Francisco, CA                 11.0
2    44093.0    Emma Arbogast          Salem, OR                 10.0
3      204.0             Josh          Australia                 10.0
4    11758.0         rvarcher  Oklahoma City, OK                  9.0
5    79346.0  Thomas Matthews         California                  8.0
6    54571.0        Christian               None                  8.0
7    42364.0         Petrogad               None                  8.0
8    20473.0      Jeremy Boyd        Houston, TX                  8.0
9    19588.0       JD Isaacks        Atlanta, GA                  8.0


## Zadanie 4

In [55]:
# Task 4: for every calendar year of voting, the PostTypeId=1 post that
# collected the most VoteTypeId=2 votes in that year, with that vote count.
#
# The outer SELECT lists Posts.Title next to MAX(...) while grouping by Year
# only. This relies on SQLite's bare-column rule: with a single MAX()
# aggregate, bare columns are taken from the row that holds the maximum.
# Other SQL engines may reject or answer this query differently.

# Start from an empty database file so that to_sql never collides with a
# table left over from a previous run of any task cell.
if os.path.isfile(baza):
    os.remove(baza)
conn = sqlite3.connect(baza)

try:
    # Load just the tables the query references.
    Posts.to_sql("Posts", conn)
    Votes.to_sql("Votes", conn)

    print(pd.read_sql_query(
"""SELECT
        Posts.Title,
        UpVotesPerYear.Year,
        MAX(UpVotesPerYear.Count) AS Count
    FROM 
        (
        SELECT
            PostId,
            COUNT(*) AS Count,
            STRFTIME('%Y', Votes.CreationDate) AS Year
        FROM Votes
        WHERE VoteTypeId=2
        GROUP BY PostId, Year
        ) AS UpVotesPerYear
    JOIN Posts ON Posts.Id=UpVotesPerYear.PostId
    WHERE Posts.PostTypeId=1
    GROUP BY Year
    ORDER BY Year ASC
""", conn))
finally:
    # Close even when loading or querying fails; an open handle on the file
    # can make the os.remove() at the top of the next cell fail.
    conn.close()

                                                Title  Year  Count
0   OK we're all adults here, so really, how on ea...  2011     70
1     How to successfully haggle / bargain in markets  2012     37
2   Why are airline passengers asked to lift up wi...  2013    103
3   How do you know if Americans genuinely/literal...  2014    179
4   Immigration officer that stopped me at the air...  2015    117
5   I don't know my nationality. How can I visit D...  2016    134
6                        Why prohibit engine braking?  2017    177
7   How can I find restaurants in the USA where ti...  2018    119
8   My name causes an issue with any booking! (nam...  2019    263
9   What's the longest distance that can be travel...  2020    110
10  Positive drug test in Singapore: How long do I...  2021     61


## Zadanie 5

In [56]:
# Task 5: the ten PostTypeId=1 posts with the most "old" votes among posts
# that received no "new" votes. Votes of type 1, 2 or 5 are bucketed by the
# year of their CreationDate: 2020 and 2021 count as 'new', every other year
# as 'old'.

# Start from an empty database file so that to_sql never collides with a
# table left over from a previous run of any task cell.
if os.path.isfile(baza):
    os.remove(baza)
conn = sqlite3.connect(baza)

try:
    # Load just the tables the query references.
    Posts.to_sql("Posts", conn)
    Votes.to_sql("Votes", conn)

    print(pd.read_sql_query(
"""SELECT
    Posts.Title,
    VotesByAge2.OldVotes
    FROM Posts
    JOIN (
        SELECT
            PostId,
            MAX(CASE WHEN VoteDate = 'new' THEN Total ELSE 0 END) NewVotes,
            MAX(CASE WHEN VoteDate = 'old' THEN Total ELSE 0 END) OldVotes,
            SUM(Total) AS Votes
        FROM (
            SELECT
              PostId,
              CASE STRFTIME('%Y', CreationDate)
                WHEN '2021' THEN 'new'
                WHEN '2020' THEN 'new'
                ELSE 'old'
              END VoteDate,
              COUNT(*) AS Total
            FROM Votes
            WHERE VoteTypeId IN (1, 2, 5)
            GROUP BY PostId, VoteDate
            ) AS VotesByAge
        GROUP BY VotesByAge.PostId
        HAVING NewVotes=0
        ) AS VotesByAge2 ON VotesByAge2.PostId=Posts.ID
    WHERE Posts.PostTypeId=1
    ORDER BY VotesByAge2.OldVotes DESC
    LIMIT 10
""", conn))
finally:
    # Close even when loading or querying fails; an open handle on the file
    # can make the os.remove() at the top of the next cell fail.
    conn.close()

                                               Title  OldVotes
0                       How to avoid drinking vodka?       200
1         How can I do a "broad" search for flights?       175
2  How to convince airport security that I am not...       119
3        Can I accidentally miss the in-flight food?       114
4  How can black travellers deal with unwanted ha...       108
5             Why are there no seat belts on trains?       106
6  How do you avoid "tourist traps" when travelin...       104
7                         Can I fly with a gold bar?       102
8  Best way to dispose of a carry-on bag in Hong ...        99
9    How to successfully haggle / bargain in markets        97
