In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col, when
import nltk
from nltk.corpus import stopwords

In [4]:
# Configuration de NLTK
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/unamed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Création de la session Spark
spark = SparkSession.builder \
    .appName("Twitter Sentiment Analysis") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/09 11:41:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Définir le schéma pour le fichier CSV
schema = StructType([
    StructField("Tweet ID", IntegerType(), True),
    StructField("Entity", StringType(), True),
    StructField("Sentiment", StringType(), True),
    StructField("Tweet content", StringType(), True)
])

In [7]:
# Charger les données CSV avec le schéma spécifié
df = spark.read.csv("twitter_training.csv", header=True, schema=schema)

In [8]:
# Remplacer les valeurs nulles dans la colonne 'Tweet content' par une chaîne vide
df_cleaned = df.withColumn('Tweet content', when(col('Tweet content').isNull(), '').otherwise(col('Tweet content')))

In [9]:
# Définir les étapes de la pipeline
tokenizer = Tokenizer(inputCol="Tweet content", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")
indexer = StringIndexer(inputCol="Sentiment", outputCol="label")
assembler = VectorAssembler(inputCols=["features"], outputCol="final_features")
lr = LogisticRegression(featuresCol='final_features', labelCol='label')

In [10]:
# Créer la pipeline
pipeline = Pipeline(stages=[tokenizer, stopwords_remover, hashing_tf, idf, indexer, assembler, lr])

In [11]:
# Division des données en ensembles d'entraînement et de test
train_data, test_data = df_cleaned.randomSplit([0.8, 0.2], seed=123)

In [12]:
# Entraînement de la pipeline
pipeline_model = pipeline.fit(train_data)

24/05/09 11:41:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 2401, Borderlands, Positive, im getting on borderlands and i will murder you all ,
 Schema: Tweet ID, Entity, Sentiment, Tweet content
Expected: Tweet ID but found: 2401
CSV file: file:///home/unamed/Projects/MST/bigData/twitter_training.csv
24/05/09 11:41:23 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 2401, Borderlands, Positive, im getting on borderlands and i will murder you all ,
 Schema: Tweet ID, Entity, Sentiment, Tweet content
Expected: Tweet ID but found: 2401
CSV file: file:///home/unamed/Projects/MST/bigData/twitter_training.csv
24/05/09 11:41:27 WARN DAGScheduler: Broadcasting large task binary with size 18.7 MiB
24/05/09 11:41:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 2401, Borderlands, Positive, im getting on borderlands and i will murder you all ,
 Schema: Tweet ID, Entity, Sentiment, Tweet content
Expected: Tweet

In [13]:
# Prédiction sur les données de test
predictions = pipeline_model.transform(test_data)

In [14]:
# Évaluation du modèle
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Logistic Regression model:", accuracy)

24/05/09 11:44:44 WARN DAGScheduler: Broadcasting large task binary with size 21.4 MiB
24/05/09 11:44:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 2401, Borderlands, Positive, im getting on borderlands and i will murder you all ,
 Schema: Tweet ID, Entity, Sentiment, Tweet content
Expected: Tweet ID but found: 2401
CSV file: file:///home/unamed/Projects/MST/bigData/twitter_training.csv

Accuracy of Logistic Regression model: 0.8267346664877198


                                                                                

In [15]:
print(predictions.columns)


['Tweet ID', 'Entity', 'Sentiment', 'Tweet content', 'words', 'filtered_words', 'raw_features', 'features', 'label', 'final_features', 'rawPrediction', 'probability', 'prediction']


In [22]:
pipeline_model.save('pipeline_model')

24/05/09 12:09:51 WARN TaskSetManager: Stage 136 contains a task of very large size (4187 KiB). The maximum recommended task size is 1000 KiB.
24/05/09 12:09:52 WARN TaskSetManager: Stage 145 contains a task of very large size (2785 KiB). The maximum recommended task size is 1000 KiB.


In [2]:
from pyspark.ml import PipelineModel
model = PipelineModel.load('pipeline_model/')

Note: you may need to restart the kernel to use updated packages.
