In [11]:
# Use sqlite file for output
import sqlite3
import pandas
# Open (or create) the scratch database file. The following cells build a
# `data` table in it, compute the average answer ratio from that table, and
# then commit and close this connection.
con = sqlite3.connect('Reddit_quality.db')
# Cursor used for the DDL and bulk-insert statements issued against `con`.
cur = con.cursor()

In [12]:
import csv

# Rebuild the scratch `data` table from all_submissions.csv. Dropping it first
# makes the cell safe to re-run.
cur.execute("drop table if exists data;")
cur.execute("""
create table data(
    AnswerCount int,
    Score int,
    AnswerRatio float
);""")

to_db = []
# newline='' is required by the csv module so that line breaks embedded in
# quoted fields (e.g. multi-line self-text) are parsed as part of the field.
with open('all_submissions.csv', 'r', encoding='utf-8', newline='') as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        # Skip submissions whose title or body carries a removed/deleted marker.
        title = row['title'].lower()
        if '[removed]' in title or '[deleted]' in title:
            continue
        selftext = row['selftext'].lower()
        if '[removed]' in selftext or '[deleted]' in selftext:
            continue

        Score = int(row['ups']) - int(row['downs'])
        AnswerCount = int(row['comments'])
        # Score per comment; None is stored as SQL NULL when there are no comments.
        AnswerRatio = float(Score) / AnswerCount if AnswerCount > 0 else None
        to_db.append((AnswerCount, Score, AnswerRatio))

# One bulk insert once the file has been read and closed. The transaction is
# not committed here; the commit happens in the cell that closes `con`.
cur.executemany("insert into data (AnswerCount, Score, AnswerRatio) VALUES (?, ?, ?);", to_db)

In [13]:
import numpy

# Pull every non-NULL answer ratio belonging to a submission whose score is
# not negative, then reduce the resulting single-column frame to its mean.
AnswerRatioTable_NonNegScores = pandas.read_sql(
    'select AnswerRatio from data where Score >= 0 AND AnswerRatio is NOT NULL',
    con,
)
Avg_AnswerRatioTable_NonNegScores = numpy.average(
    AnswerRatioTable_NonNegScores.values
)

# Persist the inserted rows and release the scratch database.
con.commit()
con.close()

In [15]:
# Make new database for training and testing
import csv

# Requires Avg_AnswerRatioTable_NonNegScores from the earlier cell: it is the
# threshold above which a submission's answer ratio is labelled "good".
con = sqlite3.connect('Reddit.db')
try:
    cur = con.cursor()
    # Drop first so the cell can be re-run without duplicating rows.
    cur.execute("DROP TABLE IF EXISTS data;")
    cur.execute("""
create table data(
    AnswerCount int,
    Score int,
    AnswerRatio float,
    Body varchar,
    Title varchar,
    answer_good int,
    answer_bad int
);""")

    to_db = []
    # newline='' is required by the csv module so that line breaks embedded in
    # quoted fields are kept intact; the self-text is stored as Body below.
    with open('all_submissions.csv', 'r', encoding='utf-8', newline='') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            # Skip submissions whose title or body carries a removed/deleted marker.
            title = row['title'].lower()
            if '[removed]' in title or '[deleted]' in title:
                continue
            selftext = row['selftext'].lower()
            if '[removed]' in selftext or '[deleted]' in selftext:
                continue

            Score = int(row['ups']) - int(row['downs'])
            AnswerCount = int(row['comments'])
            # Score per comment; None is stored as SQL NULL when there are no comments.
            AnswerRatio = float(Score) / AnswerCount if AnswerCount > 0 else None
            # Good: has comments and a ratio above the average computed earlier.
            answer_good = AnswerCount > 0 and AnswerRatio > Avg_AnswerRatioTable_NonNegScores
            # Bad: has comments but a score of zero or below.
            answer_bad = AnswerCount > 0 and Score <= 0
            to_db.append((AnswerCount, Score, AnswerRatio, selftext, title, int(answer_good), int(answer_bad)))

    cur.executemany("insert into data (AnswerCount, Score, AnswerRatio, Body, Title, answer_good, answer_bad) \
                    VALUES (?, ?, ?, ?, ?, ?, ?);", to_db)
    # Commit before the preview so the load is durable even if printing fails.
    con.commit()

    print(pandas.read_sql('select * from data limit 5', con))
finally:
    # Always release the database file, even when the load raises part-way.
    con.close()

   AnswerCount  Score  AnswerRatio Body  \
0            5      2          0.4        
1            1      0          0.0        
2           20      0          0.0        
3            1      2          2.0        
4            1     -2         -2.0        

                                               Title  answer_good  answer_bad  
0  askscience:  by what mechanism do waterfalls p...            0           0  
1  askscience: do you think principles of quantum...            0           1  
2  asksciencereddit: if an ice cube floated aroun...            0           1  
3  askscience:pepper+ginger in hot water for sore...            1           0  
4  askscience: i'm in the brainstorming stage of ...            0           1  


In [20]:
# Count rows per (answer_good, answer_bad) label combination.
con = sqlite3.connect('Reddit.db')
cur = con.cursor()
count_by_category = []
# ORDER BY pins the row order. SQLite does not guarantee any order for grouped
# results without it, and the percentage cell below indexes count_by_category
# by position. With the three combinations seen in this data set that gives
#   [0] -> (0, 0) unmarked, [1] -> (0, 1) bad, [2] -> (1, 0) good.
# NOTE(review): if a combination is missing, or (1, 1) ever appears, these
# positions shift; verify against the printed rows.
for row in cur.execute(
        'SELECT count(*), answer_good, answer_bad from data '
        'group by answer_good, answer_bad '
        'order by answer_good, answer_bad'):
    count_by_category.append(row[0])
    print(row)
# The shares wanted from these counts are:
#   answer_good = 1, answer_bad = 0  (good)
#   answer_good = 0, answer_bad = 1  (bad)
#   answer_good = 0, answer_bad = 0  (unmarked)
# TODO: Does all_submissions already remove duplicates? Seems too high...

(468216, 0, 0)
(42397, 0, 1)
(67115, 1, 0)


In [22]:
# Report each label's share of all rows as a fraction of the total.
# Each pair is (printed prefix, position of that label's count in count_by_category).
total_count = sum(count_by_category)
for label, position in (
    ('Percent good: ', 2),
    ('Percent bad: ', 1),
    ('Percent unmarked: ', 0),
):
    print(label + str(count_by_category[position] / total_count))

Percent good: 0.11617058546582475
Percent bad: 0.07338574554115432
Percent unmarked: 0.8104436689930209
