In [60]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col

# Obtain the Spark session for this notebook: getOrCreate() hands back an
# existing session when there is one, otherwise it starts one named "data".
spark = (
    SparkSession.builder
    .appName("data")
    .getOrCreate()
)


In [62]:
import os
from pathlib import Path

# Root folder that contains both dataset sub-folders. Set the BIGDATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset (or empty) the original local location is used, so existing runs
# behave exactly as before.
DATA_DIR = Path(os.environ.get("BIGDATA_DIR") or "/Users/hamzaboukhriss/Desktop/Bigdata")

# One sub-folder per dataset, both resolved against the same root.
lymsys_folder = DATA_DIR / "lymsys"
wildchat_folder = DATA_DIR / "wildchat"

In [64]:

# Load each dataset from its parquet folder (Spark expects a string path).
lymsys_df = spark.read.parquet(str(lymsys_folder))
wildchat_df = spark.read.parquet(str(wildchat_folder))

# Preview the first five rows of each dataset, lymsys first.
lymsys_df.show(n=5)
wildchat_df.show(n=5)


+--------------------+----------+-----------------------------------+----+----------+--------------------+--------+
|     conversation_id|     model|                       conversation|turn|  language|   openai_moderation|redacted|
+--------------------+----------+-----------------------------------+----+----------+--------------------+--------+
|c20948192699451b8...|vicuna-13b|               [{Given the artic...|   1|   English|[{{false, false, ...|    true|
|8b02aa2df44847a1b...|vicuna-33b|               [{Alright, let's ...|   2|   English|[{{false, false, ...|    true|
|c18f3612fac140cb9...|vicuna-13b|[{あなたが最終更新されたのはいつ...|   5|  Japanese|[{{false, false, ...|   false|
|d1fabb62e3364665a...| llama-13b|               [{You are an AI a...|   1|   English|[{{false, false, ...|    true|
|b2da335248f04e439...| koala-13b|               [{Reescreva esse ...|   1|Portuguese|[{{false, false, ...|   false|
+--------------------+----------+-----------------------------------+----+----------+--

In [66]:
from pyspark.sql.functions import when

# Derive a "device_type" column from the request's user-agent header:
# any user-agent containing "mobile" (case-insensitive) is labelled "mobile",
# everything else is labelled "desktop".
# NOTE: a null user-agent makes the condition null, so those rows also fall
# through to "desktop".
a_df = wildchat_df.withColumn(
    "device_type",
    F.when(
        F.lower(F.col("header.user-agent")).like("%mobile%"), "mobile"
    ).otherwise("desktop"),
)

# Preview the first 16 rows, including the new column.
a_df.show(16)


+--------------------+------------------+-------------------+-----------------------------------+----+--------+--------------------+--------------------+-----+--------+-------------+---------------+--------------------+--------------------+-----------+
|   conversation_hash|             model|          timestamp|                       conversation|turn|language|   openai_moderation| detoxify_moderation|toxic|redacted|        state|        country|           hashed_ip|              header|device_type|
+--------------------+------------------+-------------------+-----------------------------------+----+--------+--------------------+--------------------+-----+--------+-------------+---------------+--------------------+--------------------+-----------+
|f01a345e668b978b6...|gpt-3.5-turbo-0613|2023-09-13 04:14:27|               [{(In the clubroo...|   1| English|[{{false, false, ...|[{2.1957975695841...|false|   false|     Michigan|  United States|f96c8515aa663db23...|{en-US,en;q=0.9, ...| 

In [68]:
# Tally how many rows fall into each device category (desktop vs. mobile).
device_counts = (
    a_df
    .groupBy("device_type")
    .count()
)

device_counts.show()

+-----------+------+
|device_type| count|
+-----------+------+
|    desktop|765538|
|     mobile|169813|
+-----------+------+



In [73]:
# Regional preferences: one row per country, with the number of rows per
# device type in separate "mobile" and "desktop" columns. Countries that have
# no rows for a device type get 0 instead of null.
device_by_country_pivot = (
    a_df
    .groupBy("country")
    .pivot("device_type", ["mobile", "desktop"])
    .count()
    .fillna(0)
)

device_by_country_pivot.show()



+--------------------+------+-------+
|             country|mobile|desktop|
+--------------------+------+-------+
|              Russia| 25067| 111711|
|            Paraguay|    25|     35|
|               Macao|   301|    771|
|               Yemen|   429|    516|
|             Senegal|     4|     39|
|              Sweden|   283|   1454|
|          Cabo Verde|     1|      3|
|     The Netherlands|  2554|   7742|
|              Guyana|     0|     36|
|         Philippines|  2430|   7428|
|             Eritrea|     2|      5|
|              Jersey|     1|      0|
|            Djibouti|     0|      1|
|           Singapore|  1223|  11514|
|            Malaysia|   449|   3204|
|                Fiji|     1|     15|
|              Malawi|     1|      4|
|                Iraq|   335|    540|
|             Germany|  2906|  31575|
|Northern Mariana ...|     2|      0|
+--------------------+------+-------+
only showing top 20 rows



                                                                                