In [None]:
# Install the headless OpenJDK 8 package with apt-get (-qq keeps the output quiet).
# Presumably this provides the Java runtime for the PySpark session created below.
!apt-get install openjdk-8-jdk-headless -qq

In [None]:
!pip install pyspark


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new
# session registered under the application name "myApp".
spark = (
    SparkSession.builder
    .appName("myApp")
    .getOrCreate()
)


In [None]:
from pyspark.sql import Row

# Smoke test for the session: turn a small list of (name, age) tuples into
# an RDD of Row objects, build a DataFrame from it and print it.
data = [('Alice', 1), ('Bob', 2), ('Charlie', 3)]
rdd = spark.sparkContext.parallelize(data)
people = rdd.map(lambda pair: Row(name=pair[0], age=pair[1]))

df = spark.createDataFrame(people)
df.show()


+-------+---+
|   name|age|
+-------+---+
|  Alice|  1|
|    Bob|  2|
|Charlie|  3|
+-------+---+



In [None]:
!pip install findspark


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os

# Point Spark at a local Java and Spark installation.
# Raw strings keep the Windows backslashes literal: in a normal string,
# sequences such as "\P", "\J" or "\s" are invalid escape sequences, which
# Python only tolerates with a warning.
# NOTE(review): these are machine-specific Windows paths; adjust them per
# machine. Presumably they must be set before the Spark session is started
# to have any effect.
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk1.8.0_361"
os.environ["SPARK_HOME"] = r"C:\spark-3.3.2-bin-hadoop3"


In [None]:
import findspark

# Initialise findspark with the local Spark installation directory.
# Raw string: keeps the backslash of the Windows path literal ("\s" is an
# invalid escape sequence in a normal string).
findspark.init(r"C:\spark-3.3.2-bin-hadoop3")


#Age and Gender Distribution:

In [None]:
from pyspark.sql.functions import count

# Load the CSV: the first line is the header and column types are inferred.
df = spark.read.options(header=True, inferSchema=True).csv("pseudo_facebook.csv")

# Count the userid values in every (age, gender) group.
age_gender_counts = (
    df.groupBy("age", "gender")
    .agg(count("userid").alias("count"))
)

# Print the first rows of the grouped counts (row order is not sorted).
age_gender_counts.show()


+---+------+-----+
|age|gender|count|
+---+------+-----+
| 24|  male| 1852|
| 70|  male|  141|
|108|female|  624|
| 70|    NA|    3|
|108|  male| 1016|
| 58|    NA|    3|
| 62|female|  410|
|108|    NA|   21|
| 99|female|   25|
| 38|female|  396|
| 81|female|   59|
| 56|female|  476|
| 42|  male|  507|
| 76|    NA|    1|
| 26|    NA|    1|
| 92|female|   17|
| 75|    NA|    3|
| 18|  male| 3159|
| 32|female|  524|
| 52|  male|  489|
+---+------+-----+
only showing top 20 rows



#INFERENCE
Analysis of age distribution: This analysis can help us understand the age range of users on the platform. If we find that the age distribution is skewed towards a particular age group, it may indicate that the platform is more popular among users of a certain age.

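The age distribution of users appears to be skewed towards users in their 20s and 30s. Note that the table above shows only 20 unordered (age, gender) groups, so it cannot confirm this on its own; sorting the groups by count is needed to verify it. The claim that the platform is slightly more popular among females than males is not supported by the rows shown: the largest groups displayed are male (age 18: 3,159; age 24: 1,852), and at age 108 males (1,016) outnumber females (624). Aggregating by gender alone would settle this.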

#Friend Count and Friendships Initiated:

In [None]:
from pyspark.sql.functions import corr

# `df` is reused from the cell that loaded pseudo_facebook.csv:
# df = spark.read.csv("pseudo_facebook.csv", header=True, inferSchema=True)

# Correlation coefficient between the number of friends and the number of
# friendships the user initiated (a single Python float).
corr_value = df.stat.corr("friend_count", "friendships_initiated")

# Show the coefficient
print(corr_value)


0.8258499569806989


#INFERENCE
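Friend count and friendships initiated are strongly positively correlated (r ≈ 0.83): users who have more friends also tend to have initiated more friendships. The correlation alone does not tell us whether users initiate more friendships than they have friends; that requires comparing the averages of the two columns, as described below.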

By analyzing the distribution of friend count and friendships initiated, we can infer whether users tend to initiate more friendships than the number of friends they have or vice versa. If we find that the average number of friendships initiated is higher than the average friend count, it suggests that users are more proactive in initiating friendships. Conversely, if the average friend count is higher than the average number of friendships initiated, it suggests that users are more passive in making new friends.

#Likes and Likes Received:

In [None]:

import pyspark.sql.functions as F
from pyspark.sql.functions import desc

# Reload the dataset (header row present, column types inferred).
df = spark.read.options(header=True, inferSchema=True).csv("pseudo_facebook.csv")

# Platform-wide totals: one single-row DataFrame per metric.
likes_given = df.agg(F.sum("likes").alias("likes_given"))
likes_received = df.agg(F.sum("likes_received").alias("likes_received"))

# Ten users with the highest value of each metric, largest first.
top_users_given = (
    df.select("userid", "likes")
    .orderBy(desc("likes"))
    .limit(10)
)
top_users_received = (
    df.select("userid", "likes_received")
    .orderBy(desc("likes_received"))
    .limit(10)
)

# Print the totals first, then the two top-10 tables.
likes_given.show()
likes_received.show()
top_users_given.show()
top_users_received.show()


+-----------+
|likes_given|
+-----------+
|   15452268|
+-----------+

+--------------+
|likes_received|
+--------------+
|      14126675|
+--------------+

+-------+-----+
| userid|likes|
+-------+-----+
|1684195|25111|
|1656477|21652|
|1489463|16732|
|1429178|16583|
|1267229|14799|
|1783264|14355|
|1002588|14050|
|1412849|14039|
|1878566|13692|
|2104503|13622|
+-------+-----+

+-------+--------------+
| userid|likes_received|
+-------+--------------+
|1674584|        261197|
|1441676|        178166|
|1715925|        152014|
|2063006|        106025|
|1053087|         82623|
|1432020|         53534|
|2042824|         52964|
|1559908|         45633|
|1781243|         42449|
|1015907|         39536|
+-------+--------------+



#INFERENCE
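In aggregate, users give more likes (15,452,268) than they receive (14,126,675). Received likes are far more concentrated, however: the top receiver has 261,197 likes received, while the top giver has given 25,111.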

By analyzing the distribution of likes and likes received, we can understand how users interact with each other on the platform. If we find that the average number of likes received is higher than the average number of likes given, it suggests that users tend to receive more appreciation than they give. Conversely, if the average number of likes given is higher than the average number of likes received, it suggests that users tend to be more generous in showing appreciation.

#Mobile Likes and Mobile Likes Received:

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col

# `df` is reused from the cell that loaded pseudo_facebook.csv:
# df = spark.read.csv("pseudo_facebook.csv", header=True, inferSchema=True)

# Total number of mobile likes given and mobile likes received.
# The namespaced F.sum is used on purpose: importing `sum` directly from
# pyspark.sql.functions would shadow Python's builtin sum() for every
# later cell of the notebook.
mobile_likes_given = df.agg(F.sum(col("mobile_likes")).alias("mobile_likes_given"))
mobile_likes_received = df.agg(F.sum(col("mobile_likes_received")).alias("mobile_likes_received"))

# Display the results
mobile_likes_given.show()
mobile_likes_received.show()


+------------------+
|mobile_likes_given|
+------------------+
|          10505832|
+------------------+

+---------------------+
|mobile_likes_received|
+---------------------+
|              8328181|
+---------------------+



#INFERENCE
The number of mobile likes (10,505,832 given; 8,328,181 received) is substantially higher than the number of www likes (4,946,430 given; 5,798,490 received). This suggests that most liking activity happens on the mobile version of the platform rather than on the desktop site.

#WWW Likes and WWW Likes Received:

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col

# `df` is reused from the cell that loaded pseudo_facebook.csv:
# df = spark.read.csv("pseudo_facebook.csv", header=True, inferSchema=True)

# Total number of web (www) likes given and web likes received.
# The namespaced F.sum is used on purpose: importing `sum` directly from
# pyspark.sql.functions would shadow Python's builtin sum() for every
# later cell of the notebook.
www_likes_given = df.agg(F.sum(col("www_likes")).alias("www_likes_given"))
www_likes_received = df.agg(F.sum(col("www_likes_received")).alias("www_likes_received"))

# Display the results
www_likes_given.show()
www_likes_received.show()


+---------------+
|www_likes_given|
+---------------+
|        4946430|
+---------------+

+------------------+
|www_likes_received|
+------------------+
|           5798490|
+------------------+



#INFERENCE
The number of www likes (4,946,430 given; 5,798,490 received) is clearly lower than the number of mobile likes (10,505,832 given; 8,328,181 received). This suggests that most users like content from the mobile version of the platform rather than from the desktop site.

#Tenure:

In [None]:
from pyspark.sql.functions import avg

# `df` is reused from an earlier cell that loaded pseudo_facebook.csv:
# df = spark.read.csv("pseudo_facebook.csv", header=True, inferSchema=True)

# Mean of the tenure column within each gender value.
avg_tenure_by_gender = (
    df.groupBy("gender")
    .agg(avg("tenure").alias("avg_tenure"))
)

# Print one row per gender value
avg_tenure_by_gender.show()


+------+------------------+
|gender|        avg_tenure|
+------+------------------+
|    NA|1801.5142857142857|
|female| 587.2292308456723|
|  male|500.20439102673544|
+------+------------------+



#INFERENCE
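 The average tenure is about 587 days (roughly 1.6 years) for female users and about 500 days (roughly 1.4 years) for male users, so female users have on average been on the platform about 87 days longer. The small group with gender "NA" has a much higher average tenure (about 1,800 days). Because tenure measures time since registration, these averages alone do not show how long users stay active before moving on to other social media platforms.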

Tenure refers to the amount of time a user has been active on the social media platform. In the context of this dataset, tenure represents the number of days between the user's registration date and the date when the data was collected. A higher tenure value indicates that the user has been active on the platform for a longer period of time.

#SENTIMENT ANALYSIS

In [None]:
from pyspark.sql.functions import col, when

# Load the Facebook dataset. No schema inference is requested here; the
# columns that need a numeric type are cast explicitly in later cells.
# The file marks missing values with the literal text "NA" (visible in the
# gender column of the grouped output earlier in this notebook). Mapping it
# to null with nullValue makes the later `df.na.drop()` actually remove those
# rows instead of letting "NA" be treated as an ordinary gender value.
df = spark.read.csv('/content/pseudo_facebook.csv', sep=',', header=True, nullValue='NA')

In [None]:
# Remove every row that contains at least one null value.
# Only real nulls count: a literal "NA" string survives unless the reader
# was told to map it to null.
df = df.dropna()

In [None]:
# Cast age to an integer column and encode gender as a 0/1 flag:
# 1 when the value is 'male', 0 for every other value.
df = (
    df
    .withColumn('age', col('age').cast('integer'))
    .withColumn('gender', when(col('gender') == 'male', 1).otherwise(0))
)

In [None]:
# Preview the first five rows after the age cast and gender encoding.
df.show(5)

+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
| userid|age|dob_day|dob_year|dob_month|gender|tenure|friend_count|friendships_initiated|likes|likes_received|mobile_likes|mobile_likes_received|www_likes|www_likes_received|
+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
|2094382| 14|     19|    1999|       11|     1|   266|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|1192601| 14|      2|    1999|       11|     0|     6|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|2083884| 14|     16|    1999|       11|     1|    13|           0|                    0|    0|             0|           0|  

In [None]:
# calculate the total number of likes received
# NOTE(review): df was loaded without inferSchema, so both operands are
# presumably string columns; the 0.0 values in the preview below suggest the
# `+` yields a floating-point result. Confirm that this dtype is intended.
# NOTE(review): the totals printed earlier (14,126,675 likes_received vs
# 8,328,181 mobile + 5,798,490 www) suggest likes_received may already include
# www_likes_received, in which case this sum double-counts www likes. Verify.
df = df.withColumn('total_likes_received', col('likes_received') + col('www_likes_received'))

In [None]:
# Keep only the columns used by the rest of the analysis, in this order.
df = df.select(
    'userid',
    'gender',
    'age',
    'tenure',
    'friend_count',
    'friendships_initiated',
    'total_likes_received',
)


In [None]:
# show the first 5 rows of the dataframe (now reduced to the selected columns
# plus total_likes_received)
df.show(5)

+-------+------+---+------+------------+---------------------+--------------------+
| userid|gender|age|tenure|friend_count|friendships_initiated|total_likes_received|
+-------+------+---+------+------------+---------------------+--------------------+
|2094382|     1| 14|   266|           0|                    0|                 0.0|
|1192601|     0| 14|     6|           0|                    0|                 0.0|
|2083884|     1| 14|    13|           0|                    0|                 0.0|
|1203168|     0| 14|    93|           0|                    0|                 0.0|
|1733186|     1| 14|    82|           0|                    0|                 0.0|
+-------+------+---+------+------------+---------------------+--------------------+
only showing top 5 rows



In [None]:
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [None]:
from pyspark.sql.functions import concat_ws

# Combine the selected columns into one space-separated string column.
# `+` between Columns is arithmetic addition, not string concatenation, so
# the original expression did not yield a string column and the Tokenizer
# step rejected it. concat_ws joins the string-cast values with the given
# separator and always returns a string column.
text_parts = [
    col(name).cast("string")
    for name in (
        "userid",
        "age",
        "gender",
        "tenure",
        "friend_count",
        "friendships_initiated",
        "total_likes_received",
    )
]
df = df.withColumn("new_text", concat_ws(" ", *text_parts))


In [None]:
# Sanity check: confirm that df is still a Spark DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [None]:
# split the text into tokens
# Reads the "new_text" column and writes the token list to a new "words" column.
# NOTE(review): Tokenizer presumably requires "new_text" to be a string column
# and raises IllegalArgumentException otherwise; check the column type with
# df.printSchema() if this cell fails.
tokenizer = Tokenizer(inputCol="new_text", outputCol="words")
df = tokenizer.transform(df)

IllegalArgumentException: ignored

In [None]:
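# NOTE: draft of the intended pipeline, kept commented out for reference only.
# It is not runnable as written: the selected columns contain no "sentiment"
# label column, "+" does not concatenate string columns (use concat_ws), and
# NaiveBayes expects a numeric feature vector rather than the token array in
# "filtered".
# from pyspark.sql.functions import col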
# from pyspark.ml.feature import Tokenizer, StopWordsRemover
# from pyspark.ml.classification import NaiveBayes
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# # load the preprocessed dataframe
# df = spark.read.csv('/path/to/your/preprocessed_dataframe.csv', header=True)

# # select only the columns needed for analysis
# df = df.select("userid", "age", "gender", "tenure", "friend_count", "friendships_initiated", "total_likes_received")

# # combine all text columns into a single column for text analysis
# df = df.withColumn("text", col("userid") + " " + col("age").cast("string") + " " + col("gender") + " " + col("tenure").cast("string") + " " + col("friend_count").cast("string") + " " + col("friendships_initiated").cast("string") + " " + col("total_likes_received").cast("string"))

# # split the text into tokens
# tokenizer = Tokenizer(inputCol="text", outputCol="words")
# df = tokenizer.transform(df)

# # remove stop words
# remover = StopWordsRemover(inputCol="words", outputCol="filtered")
# df = remover.transform(df)

# # create the Naive Bayes model
# nb = NaiveBayes(smoothing=1.0, modelType="multinomial", featuresCol="filtered", labelCol="sentiment")

# # split the dataset into training and test sets
# (trainingData, testData) = df.randomSplit([0.8, 0.2], seed=1234)

# # train the model
# model = nb.fit(trainingData)

# # make predictions on the test set
# predictions = model.transform(testData)

# # evaluate the accuracy of the model
# evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="sentiment", metricName="accuracy")
# accuracy = evaluator.evaluate(predictions)
# print("Accuracy:", accuracy)
