# Count trials and correct rate per algorithm type, for individual users and for tier groups

#### Key feature
- Generates, for each algorithm type, submission counts and the correct rate for individual users and for tier (user rank) groups

### Configurations

In [1]:
import logging

# Layout of every log line: timestamp, level, message.
# logging renders ``datefmt`` with time.strftime, which has no ``%f``
# directive (it was printed literally as "%f"), so the sub-second part is
# taken from the record's ``msecs`` attribute in the format string instead.
LOG_FORMAT = '%(asctime)s:%(msecs)03d %(levelname)s %(message)s'
LOG_DATE_FORMAT = '%Y-%m-%d,%H:%M:%S'

logging.basicConfig(format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT,
                    level=logging.INFO)

In [2]:
# Mark the start of the run in the log (emitted through the root logger).
logging.info('Application Start!!')

2022-06-10,00:45:52:%f INFO Application Start!!


### Get trials

In [3]:
import findspark
# Locate the local Spark installation so that pyspark can be imported;
# this has to run before the first `pyspark` import.
findspark.init()

In [4]:
from pyspark import SparkConf, SparkContext, SQLContext

# Spark configuration: run on YARN under a fixed application name.
conf = SparkConf()
conf.setMaster("yarn")
conf.setAppName("bigdata-group1-top-algo")

# Reuse an already running SparkContext if there is one, otherwise start it.
sc = SparkContext.getOrCreate(conf=conf)

# SQL entry point used below to read the trials CSV.
sqlContext = SQLContext(sc)


In [5]:
# Load the trials CSV (UTF-8, first line is the header row).
trial_df = (
    sqlContext.read
    .format("csv")
    .options(encoding="UTF-8", header="true")
    .load("data/trials.csv")
)

In [6]:
# Discard rows that have no algorithm type; they cannot be attributed to one.
trial_df = trial_df.dropna(subset=["type"])

In [7]:
# Print the column names and types of the loaded data. Every column is a
# string because the CSV was read without schema inference.
trial_df.printSchema()

root
 |-- user: string (nullable = true)
 |-- user_rank: string (nullable = true)
 |-- questionId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- difficulty: string (nullable = true)
 |-- type: string (nullable = true)
 |-- result: string (nullable = true)
 |-- memory: string (nullable = true)
 |-- time: string (nullable = true)
 |-- language: string (nullable = true)
 |-- volume: string (nullable = true)
 |-- date: string (nullable = true)



In [15]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType
# Turn the '/'-separated "type" string into an array of its pieces.
type_parts = F.split("type", "/")
trial_split_df = trial_df.withColumn("type", type_parts)

def cut_tail(array):
    """Return ``array`` without its last element.

    Used as a Spark UDF on the split "type" column to drop the final piece
    (presumably an empty string left by a trailing '/' in the raw value;
    confirm against the data).

    ``None`` is returned unchanged: Spark hands null cells to Python UDFs as
    ``None``, and slicing it would raise ``TypeError`` and fail the job.
    """
    if array is None:
        return None
    return array[:-1]
# Wrap cut_tail as a Spark UDF that returns an array of strings.
cut_tail_udf = F.udf(f=cut_tail, returnType=ArrayType(StringType()))

# Replace the "type" array by the same array minus its last element.
trial_typelist_df = trial_split_df.withColumn(
    "type",
    cut_tail_udf(F.col("type")),
)

In [21]:
# Substrings that identify each outcome in the raw (Korean) result text.
correct_keyword = ["맞았습니다"]              # "correct"
wrong_keyword = ["초과", "틀렸습니다"]         # "exceeded", "wrong"
error_keyword = ["런타임", "컴파일", "출력"]    # "runtime", "compile", "output"


def categorize(x):
    """Map a raw result string to one of four coarse labels.

    The keyword lists are tried in a fixed order (correct, wrong, error). The
    first list with a keyword occurring as a substring of ``x`` decides the
    label, so a text matching both a "wrong" and an "error" keyword is
    labelled "틀렸습니다".

    Args:
        x: Raw result text, or ``None`` for a null cell.

    Returns:
        "맞았습니다" (correct), "틀렸습니다" (wrong), "에러" (error), or
        "기타" (other) when nothing matches or ``x`` is ``None``.
    """
    # The "result" column is nullable and Spark passes null cells to Python
    # UDFs as None; `keyword in None` would raise TypeError and fail the job.
    if x is None:
        return "기타"

    # Order matters: earlier entries win when several keywords match.
    labelled_keywords = (
        ("맞았습니다", correct_keyword),
        ("틀렸습니다", wrong_keyword),
        ("에러", error_keyword),
    )
    for label, keywords in labelled_keywords:
        if any(keyword in x for keyword in keywords):
            return label
    return "기타"

# Wrap categorize as a Spark UDF that returns a string label.
categorize_udf = F.udf(f=categorize, returnType=StringType())

# Overwrite the raw "result" text with its coarse label.
categorized_df = trial_typelist_df.withColumn(
    "result",
    categorize_udf("result"),
)

In [25]:
# One row per (trial, algorithm type), keeping only trials labelled as
# correct or wrong; "에러" and "기타" rows are left out of the statistics.
exploded_df = categorized_df.withColumn("type", F.explode("type"))
cor_wro_df = exploded_df.where(
    F.col("result").isin("맞았습니다", "틀렸습니다")
)

In [49]:
# Number of trials per user, rank, algorithm type and outcome.
user_trial_keys = ["user", "user_rank", "type", "result"]
user_trial_df = cor_wro_df.groupBy(*user_trial_keys).count()
# Mark the counts for caching before they are aggregated further.
user_trial_df.persist()

DataFrame[user: string, user_rank: string, type: string, result: string, count: bigint]

In [56]:
# Per user and algorithm type: total submissions, correct and wrong counts,
# and the correct rate (correct / submit).
is_correct = F.col("result") == "맞았습니다"
is_wrong = F.col("result") == "틀렸습니다"

user_stat_df = (
    user_trial_df
    .groupBy("user", "user_rank", "type")
    .agg(
        F.sum("count").alias("submit"),
        # when() without otherwise() yields null for non-matching rows, so a
        # group with no matching outcome sums to null ...
        F.sum(F.when(is_correct, F.col("count"))).alias("correct"),
        F.sum(F.when(is_wrong, F.col("count"))).alias("wrong"),
    )
    # ... which is turned into 0 here.
    .fillna(0)
    .withColumn("rate", F.col("correct") / F.col("submit"))
    # Best algorithm types first within each user.
    .orderBy("user", F.desc("rate"))
)

In [57]:
# Save the per-user statistics as UTF-8 CSV, replacing any previous output.
(
    user_stat_df.write
    .format("csv")
    .mode("overwrite")
    .options(encoding="UTF-8")
    .save("static/stat_correct/algo_correct_stat_user")
)

In [51]:
# Number of trials per rank group, algorithm type and outcome.
group_trial_keys = ["user_rank", "type", "result"]
group_trial_df = cor_wro_df.groupBy(*group_trial_keys).count()
# Mark the counts for caching before they are aggregated further.
group_trial_df.persist()

DataFrame[user_rank: string, type: string, result: string, count: bigint]

In [59]:
# Per rank group and algorithm type: total submissions, correct and wrong
# counts, and the correct rate (correct / submit).
is_correct = F.col("result") == "맞았습니다"
is_wrong = F.col("result") == "틀렸습니다"

group_stat_df = (
    group_trial_df
    .groupBy("user_rank", "type")
    .agg(
        F.sum("count").alias("submit"),
        # when() without otherwise() yields null for non-matching rows, so a
        # group with no matching outcome sums to null ...
        F.sum(F.when(is_correct, F.col("count"))).alias("correct"),
        F.sum(F.when(is_wrong, F.col("count"))).alias("wrong"),
    )
    # ... which is turned into 0 here.
    .fillna(0)
    .withColumn("rate", F.col("correct") / F.col("submit"))
    # Best algorithm types first within each rank group.
    .orderBy("user_rank", F.desc("rate"))
)

In [60]:
# Save the per-rank-group statistics as UTF-8 CSV, replacing any previous
# output.
(
    group_stat_df.write
    .format("csv")
    .mode("overwrite")
    .options(encoding="UTF-8")
    .save("static/stat_correct/algo_correct_stat_rankgroup")
)

In [15]:
# Shut down the SparkContext at the end of the notebook.
sc.stop()